1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/******************************************************************** 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * COPYRIGHT: 31b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert * Copyright (c) 1999-2015, International Business Machines Corporation and 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * others. All Rights Reserved. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ********************************************************************/ 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/************************************************************************ 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Date Name Description 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 12/15/99 Madhu Creation. 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 01/12/2000 Madhu Updated for changed API and added new tests 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru************************************************************************/ 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "utypeinfo.h" // for 'typeid' to work 1327f654740f2a26ad62a5c155af9199af9e69b889claireho 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/brkiter.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/rbbi.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchar.h" 
22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utf16.h" 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ucnv.h" 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/schriter.h" 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uniset.h" 26103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if !UCONFIG_NO_REGULAR_EXPRESSIONS 27103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/regex.h" 28103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#endif 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/ustring.h" 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utext.h" 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "intltest.h" 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "rbbitst.h" 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <string.h> 34f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "charstr.h" 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uvector.h" 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uvectr32.h" 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdio.h> 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include <stdlib.h> 3954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/numfmt.h" 4054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/uscript.h" 411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#include "cmemory.h" 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define TEST_ASSERT(x) {if (!(x)) { \ 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 
45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 46c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 476d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------- 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// runIndexedTest 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------- 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 54103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 55103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// Note: Before adding new tests to this file, check whether the desired test data can 56103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// simply be added to the file testdata/rbbitest.txt. In most cases it can, 57103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// it's much less work than writing a new test, diagnostic output in the event of failures 58103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// is good, and the test data file will is shared with ICU4J, so eventually the test 59103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// will run there as well, without additional effort. 
60103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (exec) logln("TestSuite RuleBasedBreakIterator: "); 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (index) { 6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_FILE_IO 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0: name = "TestBug4153072"; 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestBug4153072(); break; 6950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#else 7050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case 0: name = "skip"; 7150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 7250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 7350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 74103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case 1: name = "skip"; 75103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius break; 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 2: name = "TestStatusReturn"; 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestStatusReturn(); break; 7850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 7950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_FILE_IO 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 3: name = "TestUnicodeFiles"; 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestUnicodeFiles(); break; 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 4: name = "TestEmptyString"; 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestEmptyString(); break; 
8450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#else 8550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case 3: case 4: name = "skip"; 8650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 5: name = "TestGetAvailableLocales"; 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestGetAvailableLocales(); break; 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 6: name = "TestGetDisplayName"; 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestGetDisplayName(); break; 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 9550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_FILE_IO 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 7: name = "TestEndBehaviour"; 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestEndBehaviour(); break; 98103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case 8: case 9: case 10: name = "skip"; 99103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius break; 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 11: name = "TestWordBreaks"; 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestWordBreaks(); break; 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 12: name = "TestWordBoundary"; 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestWordBoundary(); break; 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 13: name = "TestLineBreaks"; 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestLineBreaks(); break; 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 14: name = "TestSentBreaks"; 
107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestSentBreaks(); break; 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 15: name = "TestExtended"; 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestExtended(); break; 11050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#else 11150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; 11250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 11350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 11450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 115103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 11654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius case 16: 11754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius name = "TestMonkey"; if(exec) TestMonkey(params); break; 118103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#else 11950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case 16: 120103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius name = "skip"; break; 121103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#endif 12250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_FILE_IO 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 17: name = "TestBug3818"; 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestBug3818(); break; 12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#else 127103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case 17: name = "skip"; 12850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 12950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 131103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case 18: name = "skip"; 132103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 
break; 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 19: name = "TestDebug"; 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(exec) TestDebug(); break; 13554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius case 20: name = "skip"; 13654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius break; 13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_FILE_IO 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 21: name = "TestBug5775"; 140b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (exec) TestBug5775(); break; 14150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#else 142103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case 21: name = "skip"; 14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 145103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1468393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius case 22: name = "TestBug9983"; 1478393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if (exec) TestBug9983(); break; 148b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho case 23: name = "TestDictRules"; 14950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (exec) TestDictRules(); break; 150b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho case 24: name = "TestBug5532"; 15127f654740f2a26ad62a5c155af9199af9e69b889claireho if (exec) TestBug5532(); break; 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: name = ""; break; //needed to end loop 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------------------------------------- 
158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// class BITestData Holds a set of Break iterator test data and results 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Includes 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// - the string data to be broken 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// - a vector of the expected break positions. 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// - a vector of source line numbers for the data, 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// (to help see where errors occured.) 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// - The expected break tag values. 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// - Vectors of actual break positions and tag values. 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// - Functions for comparing actual with expected and 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// reporting errors. 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------- 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass BITestData { 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querupublic: 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString fDataToBreak; 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector fExpectedBreakPositions; 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector fExpectedTags; 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector fLineNum; 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector fActualBreakPositions; // Test Results. 
178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector fActualTags; 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BITestData(UErrorCode &status); 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void checkResults(const char *heading, RBBITest *test); 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void clearResults(); 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Constructor. 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruBITestData::BITestData(UErrorCode &status) 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fActualTags(status) 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// addDataChunk. Add a section (non-breaking) piece if data to the test data. 
198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The macro form collects the line number, which is helpful 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// when tracking down failures. 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// A null data item is inserted at the start of each test's data 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// to put the starting zero into the data list. The position saved for 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// each non-null item is its ending position. 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) {return;} 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (data != NULL) { 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fDataToBreak.append(CharsToUnicodeString(data)); 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fExpectedTags.addElement(tag, status); 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLineNum.addElement(lineNum, status); 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 
218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// checkResults. Compare the actual and expected break positions, report any differences. 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid BITestData::checkResults(const char *heading, RBBITest *test) { 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expectedIndex = 0; 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t actualIndex = 0; 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If we've run through both the expected and actual results vectors, we're done. 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // break out of the loop. 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedIndex >= fExpectedBreakPositions.size() && 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru actualIndex >= fActualBreakPositions.size()) { 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedIndex >= fExpectedBreakPositions.size()) { 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru err(heading, test, expectedIndex-1, actualIndex); 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru actualIndex++; 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (actualIndex >= 
fActualBreakPositions.size()) { 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru err(heading, test, expectedIndex, actualIndex-1); 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedIndex++; 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru err(heading, test, expectedIndex, actualIndex); 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Try to resync the positions of the indices, to avoid a rash of spurious erros. 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru actualIndex++; 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedIndex++; 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("%s, tag mismatch. 
Test Line = %d, expected tag=%d, got %d", 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru heading, fLineNum.elementAt(expectedIndex), 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru actualIndex++; 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedIndex++; 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// err - An error was found. Report it, along with information about where the 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// incorrectly broken test data appeared in the source file. 
270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t actual = fActualBreakPositions.elementAti(actualIdx); 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t o = 0; 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t line = fLineNum.elementAti(expectedIdx); 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedIdx > 0) { 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The line numbers are off by one because a premature break occurs somewhere 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // within the previous item, rather than at the start of the current (expected) item. 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We want to report the offset of the unexpected break from the start of 281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this previous item. 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (actual < expected) { 285c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru test->errln("%s unexpected break at offset %d in test item from line %d. 
actual break: %d expected break: %d", heading, o, line, actual, expected); 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 287c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid BITestData::clearResults() { 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fActualBreakPositions.removeAllElements(); 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fActualTags.removeAllElements(); 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------------- 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// RBBITest constructor and destructor 301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------------- 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBITest::RBBITest() { 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBITest::~RBBITest() { 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//----------------------------------------------------------------------------------- 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Test for status {tag} return value from break rules. 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// TODO: a more thorough test. 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//----------------------------------------------------------------------------------- 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestStatusReturn() { 318c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString rulesString1("$Letters = [:L:];\n" 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "$Numbers = [:N:];\n" 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "$Letters+{1};\n" 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "$Numbers+{2};\n" 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Help\\ {4}/me\\!;\n" 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "[^$Letters $Numbers];\n" 324c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "!.*;\n", -1, US_INV); 325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString testString1 = "abc123..abc Help me Help me!"; 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 01234567890123456789012345678 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t 
brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status=U_ZERO_ERROR; 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UParseError parseError; 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 33359d709d503bab6e2b61931737e662dd293b40578ccornelius BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 3356d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("FAIL : in construction - %s", u_errorName(status)); 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t pos; 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i = 0; 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(testString1); 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pos != bounds1[i]) { 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int tag = bi->getRuleStatus(); 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tag != brkStatus[i]) { 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 
350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru i++; 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 358f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusstatic void printStringBreaks(UText *tstr, int expected[], int expectedCount) { 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char name[100]; 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("code alpha extend alphanum type word sent line name\n"); 362f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int nextExpectedIndex = 0; 363f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utext_setNativeIndex(tstr, 0); 364f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { 365f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { 366f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius printf("------------------------------------------------ %d\n", j); 367f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ++nextExpectedIndex; 368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 369f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 370f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UChar32 c = utext_next32(tstr); 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("%7x %5d %6d 
%8d %4s %4s %4s %4s %s\n", (int)c, 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_isUAlphabetic(c), 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_isalnum(c), 376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_charType(c), 378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SHORT_PROPERTY_NAME), 379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getPropertyValueName(UCHAR_WORD_BREAK, 380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getIntPropertyValue(c, 381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCHAR_WORD_BREAK), 382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SHORT_PROPERTY_NAME), 383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getIntPropertyValue(c, 385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCHAR_SENTENCE_BREAK), 386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SHORT_PROPERTY_NAME), 387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getPropertyValueName(UCHAR_LINE_BREAK, 388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_getIntPropertyValue(c, 389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UCHAR_LINE_BREAK), 390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_SHORT_PROPERTY_NAME), 391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name); 392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
396f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusstatic void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) { 397f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UErrorCode status = U_ZERO_ERROR; 398f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UText *tstr = NULL; 399f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius tstr = utext_openConstUnicodeString(NULL, &ustr, &status); 400f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_FAILURE(status)) { 401f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status)); 402f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return; 403f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 404f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius printStringBreaks(tstr, expected, expectedCount); 405f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utext_close(tstr); 406f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 407f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 408f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestBug3818() { 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Four Thai words... 
413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString thaiStr(thaiWordData); 416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 41759d709d503bab6e2b61931737e662dd293b40578ccornelius BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status) || bi == NULL) { 4196d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(thaiStr); 423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t startOfSecondWord = bi->following(1); 425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (startOfSecondWord != 4) { 426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Fail at file %s, line %d expected start of word at 4, got %d", 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru __FILE__, __LINE__, startOfSecondWord); 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru startOfSecondWord = bi->following(0); 430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (startOfSecondWord != 4) { 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Fail at file %s, line %d expected start of word at 4, got %d", 
432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru __FILE__, __LINE__, startOfSecondWord); 433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------- 438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// generalIteratorTest Given a break iterator and a set of test data, 440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Run the tests and report the results. 441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------- 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi.setText(td.fDataToBreak); 447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testFirstAndNext(bi, td); 449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testLastAndPrevious(bi, td); 451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testFollowing(bi, td); 453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testPreceding(bi, td); 454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testIsBoundary(bi, 
td); 455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru doMultipleSelectionTest(bi, td); 456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// testFirstAndNext. Run the iterator forwards in the obvious first(), next() 461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// kind of loop. 462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t p; 467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastP = -1; 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tag; 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("Test first and next"); 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi.setText(td.fDataToBreak); 472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.clearResults(); 473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualBreakPositions.addElement(p, status); // Save result. 
476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tag = bi.getRuleStatus(); 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualTags.addElement(tag, status); 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p <= lastP) { 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the iterator is not making forward progress, stop. 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No need to raise an error here, it'll be detected in the normal check of results. 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastP = p; 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.checkResults("testFirstAndNext", this); 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// TestLastAndPrevious. Run the iterator backwards, starting with last(). 
491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t p; 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastP = 0x7ffffffe; 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tag; 498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 499c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru logln("Test last and previous"); 500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi.setText(td.fDataToBreak); 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.clearResults(); 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Save break position. Insert it at start of vector of results, shoving 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // already-saved results further towards the end. 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualBreakPositions.insertElementAt(p, 0, status); 507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // bi.previous(); // TODO: Why does this fix things up???? 
508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // bi.next(); 509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tag = bi.getRuleStatus(); 510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualTags.insertElementAt(tag, 0, status); 511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p >= lastP) { 512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the iterator is not making progress, stop. 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No need to raise an error here, it'll be detected in the normal check of results. 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastP = p; 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.checkResults("testLastAndPrevious", this); 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t p; 526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tag; 527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastP = -2; // A value that will never be returned as a break position. 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // cannot be -1; that is returned for DONE. 
529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("testFollowing():"); 532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi.setText(td.fDataToBreak); 533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.clearResults(); 534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Save the starting point, since we won't get that out of following. 536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p = bi.first(); 537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualBreakPositions.addElement(p, status); // Save result. 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tag = bi.getRuleStatus(); 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualTags.addElement(tag, status); 540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p = bi.following(i); 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p != lastP) { 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p == RuleBasedBreakIterator::DONE) { 545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've reached a new break position. Save it. 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualBreakPositions.addElement(p, status); // Save result. 
549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tag = bi.getRuleStatus(); 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualTags.addElement(tag, status); 551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastP = p; 552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop normally exits by means of the break in the middle. 555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Make sure that the index was at the correct position for the break iterator to have 556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // returned DONE. 557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i != td.fDataToBreak.length()) { 558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("testFollowing(): iterator returned DONE prematurely."); 559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Full check of all results. 
562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.checkResults("testFollowing", this); 563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t p; 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tag; 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastP = 0x7ffffffe; 572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("testPreceding():"); 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi.setText(td.fDataToBreak); 576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.clearResults(); 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p = bi.last(); 579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualBreakPositions.addElement(p, status); 580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tag = bi.getRuleStatus(); 581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualTags.addElement(tag, status); 582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = td.fDataToBreak.length(); i>=-1; i--) { 584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p = bi.preceding(i); 585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p 
!= lastP) { 586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p == RuleBasedBreakIterator::DONE) { 587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've reached a new break position. Save it. 590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualBreakPositions.insertElementAt(p, 0, status); 591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastP = p; 592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tag = bi.getRuleStatus(); 593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualTags.insertElementAt(tag, 0, status); 594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop normally exits by means of the break in the middle. 597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Make sure that the index was at the correct position for the break iterator to have 598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // returned DONE. 599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i != 0) { 600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("testPreceding(): iterator returned DONE prematurely."); 601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Full check of all results. 
604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.checkResults("testPreceding", this); 605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tag; 613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("testIsBoundary():"); 615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi.setText(td.fDataToBreak); 616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.clearResults(); 617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i <= td.fDataToBreak.length(); i++) { 619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (bi.isBoundary(i)) { 620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualBreakPositions.addElement(i, status); // Save result. 
621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tag = bi.getRuleStatus(); 622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.fActualTags.addElement(tag, status); 623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru td.checkResults("testIsBoundary: ", this); 626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru iterator.setText(td.fDataToBreak); 633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t offset = iterator.first(); 636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t testOffset; 637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 0; 638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (*testIterator != iterator) 642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("clone() or operator!= failed: two clones compared unequal"); 
643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru do { 645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testOffset = testIterator->first(); 646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testOffset = testIterator->next(count); 647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != testOffset) 648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != RuleBasedBreakIterator::DONE) { 651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count++; 652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = iterator.next(); 653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count > 10000 || offset == -1) { 657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("operator== failed too many times. 
Stopping test."); 658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset == -1) { 659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } while (offset != RuleBasedBreakIterator::DONE); 666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // now do it backwards... 668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = iterator.last(); 669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count = 0; 670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru do { 672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testOffset = testIterator->last(); 673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testOffset = testIterator->next(count); // next() with a negative arg is same as previous 674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != testOffset) 675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (offset != RuleBasedBreakIterator::DONE) { 678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count--; 679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offset = 
iterator.previous(); 680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } while (offset != RuleBasedBreakIterator::DONE); 682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete testIterator; 684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------- 688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// other tests 690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------- 692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestEmptyString() 693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString text = ""; 695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BITestData x(status); 698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ADD_DATACHUNK(x, "", 0, status); // Break at start of data 699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) 701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 7026d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, 
"Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru generalIteratorTest(*bi, x); 706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestGetAvailableLocales() 710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t locCount = 0; 712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const Locale* locList = BreakIterator::getAvailableLocales(locCount); 713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (locCount == 0) 7156d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("getAvailableLocales() returned an empty list!"); 716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Just make sure that it's returning good memory. 
717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i; 718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < locCount; ++i) { 719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln(locList[i].getName()); 720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//Testing the BreakIterator::getDisplayName() function 724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestGetDisplayName() 725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString result; 727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator::getDisplayName(Locale::getUS(), result); 729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 7306d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru + result); 732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (result != "French (France)") 7356d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru + result); 737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 
738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Test End Behaviour 740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @bug 4068137 741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestEndBehaviour() 743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString testString("boo."); 746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) 748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 7496d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); 750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru wb->setText(testString); 753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (wb->first() != 0) 755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Didn't get break at beginning of string."); 756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (wb->next() != 3) 757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Didn't get break before period in \"boo.\""); 758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (wb->current() != 4 && wb->next() != 4) 759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Didn't get break at end of string."); 760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete wb; 761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * @bug 4153072 764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestBug4153072() { 766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) 769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 7706d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 
771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString str("...Hello, World!..."); 774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t begin = 3; 775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end = str.length() - 3; 776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool onBoundary; 777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru iter->adoptText(textIterator); 780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int index; 781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note: with the switch to UText, there is no way to restrict the 782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // iteration range to begin at an index other than zero. 783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // String character iterators created with a non-zero bound are 784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // treated by RBBI as being empty. 785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (index = -1; index < begin + 1; ++index) { 786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru onBoundary = iter->isBoundary(index); 787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (index == 0? 
!onBoundary : onBoundary) { 788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru " and begin index = " + begin); 790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete iter; 793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Test for problem reported by Ashok Matoria on 9 July 2007 798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// One.<kSoftHyphen><kSpace>Two. 799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Sentence break at start (0) and then on calling next() it breaks at 801c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 'T' of "Two". Now, at this point if I do next() and 802c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 
803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestBug5775() { 805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 8086d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru if (U_FAILURE(status)) { 8096d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru return; 8106d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru } 8116d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru// Check for status first for better handling of no data errors. 812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT(bi != NULL); 8136d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru if (bi == NULL) { 814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 8166d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru 817c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString s("One.\\u00ad Two.", -1, US_INV); 818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 01234 56789 819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s = s.unescape(); 820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(s); 821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int pos = bi->next(); 822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT(pos == 6); 823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = bi->next(); 824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT(pos == 10); 825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = bi->previous(); 
826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT(pos == 6); 827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// RBBITest::Extended Run RBBI Tests from an external test data file 835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct TestParams { 839f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius BreakIterator *bi; // Break iterator is set while parsing test source. 840f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Changed out whenever test data changes break type. 841f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 842f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeString dataToBreak; // Data that is built up while parsing the test. 843f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString. 844f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak. 
845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 *srcCol; 846f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 847f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UText *textToBreak; // UText, could be UTF8 or UTF16. 848f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets. 849f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CharString utf8String; // UTF-8 form of text to break. 850f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 851f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius TestParams(UErrorCode &status) : dataToBreak() { 852f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bi = NULL; 853f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius expectedBreaks = new UVector32(status); 854f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius srcLine = new UVector32(status); 855f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius srcCol = new UVector32(status); 856f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textToBreak = NULL; 857f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap = new UVector32(status); 858f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 859f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 860f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ~TestParams() { 861f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius delete bi; 862f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius delete expectedBreaks; 863f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius delete srcLine; 864f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius delete srcCol; 865f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utext_close(textToBreak); 866f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius delete textMap; 867f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 868f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 869f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t getSrcLine(int32_t bp); 870f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 
int32_t getExpectedBreak(int32_t bp); 871f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t getSrcCol(int32_t bp); 872f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 873f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius void setUTF16(UErrorCode &status); 874f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius void setUTF8(UErrorCode &status); 875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 877f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// Append a UnicodeString to a CharString with UTF-8 encoding. 878f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// Substitute any invalid chars. 879f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted. 880f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusstatic void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) { 881f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_FAILURE(status)) { 882f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return; 883f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 884f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t utf8Length; 885f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight. 886f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius src.getBuffer(), src.length(), // UTF-16 data 887f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0xfffd, NULL, // Substitution char, number of subs. 
888f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius &status); 889f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 890f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return; 891f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 892f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius status = U_ZERO_ERROR; 893f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t capacity; 894f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status); 895f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius u_strToUTF8WithSub(buffer, utf8Length, NULL, 896f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius src.getBuffer(), src.length(), 897f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0xfffd, NULL, &status); 898f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius dest.append(buffer, utf8Length, status); 899f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 900f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 901f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 902f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid TestParams::setUTF16(UErrorCode &status) { 903f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status); 904f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap->removeAllElements(); 905f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int32_t i=0; i<dataToBreak.length(); i++) { 906f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (i == dataToBreak.getChar32Start(i)) { 907f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap->addElement(i, status); 908f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } else { 909f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap->addElement(-1, status); 910f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 911f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 
912f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap->addElement(dataToBreak.length(), status); 913f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius U_ASSERT(dataToBreak.length() + 1 == textMap->size()); 914f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 915f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 916f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 917f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid TestParams::setUTF8(UErrorCode &status) { 918f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_FAILURE(status)) { 919f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return; 920f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 921f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utf8String.clear(); 922f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CharStringAppend(utf8String, dataToBreak, status); 923f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status); 924f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_FAILURE(status)) { 925f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return; 926f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 927f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 928f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap->removeAllElements(); 929f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t utf16Index = 0; 930f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (;;) { 931f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap->addElement(utf16Index, status); 932f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UChar32 c32 = utext_current32(textToBreak); 933f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (c32 < 0) { 934f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius break; 935f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 936f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utf16Index += U16_LENGTH(c32); 937f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 
utext_next32(textToBreak); 938f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius while (textMap->size() < utext_getNativeIndex(textToBreak)) { 939f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius textMap->addElement(-1, status); 940f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 941f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 942f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size()); 943f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 944f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 945f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 946f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusint32_t TestParams::getSrcLine(int bp) { 947f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (bp >= textMap->size()) { 948f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp = textMap->size() - 1; 949f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 950f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t i = 0; 951f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for(; bp >= 0 ; --bp) { 952f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Move to a character boundary if we are not on one already. 
953f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i = textMap->elementAti(bp); 954f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (i >= 0) { 955f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius break; 956f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 957f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 958f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return srcLine->elementAti(i); 959f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 960f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 961f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 962f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusint32_t TestParams::getExpectedBreak(int bp) { 963f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (bp >= textMap->size()) { 964f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return 0; 965f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 966f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t i = textMap->elementAti(bp); 967f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t retVal = 0; 968f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (i >= 0) { 969f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius retVal = expectedBreaks->elementAti(i); 970f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 971f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return retVal; 972f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 973f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 974f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 975f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusint32_t TestParams::getSrcCol(int bp) { 976f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (bp >= textMap->size()) { 977f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp = textMap->size() - 1; 978f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 979f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t i = 0; 980f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for(; bp >= 0; --bp) { 
981f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Move bp to a character boundary if we are not on one already. 982f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i = textMap->elementAti(bp); 983f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (i >= 0) { 984f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius break; 985f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 986f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 987f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return srcCol->elementAti(i); 988f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 989f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 990f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 991f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid RBBITest::executeTest(TestParams *t, UErrorCode &status) { 992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t bp; 993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t prevBP; 994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i; 995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 996f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius TEST_ASSERT_SUCCESS(status); 997f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (U_FAILURE(status)) { 998f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return; 999f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 1000f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (t->bi == NULL) { 1002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1005f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius t->bi->setText(t->textToBreak, status); 1006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Run the iterator forward 
1008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prevBP = -1; 1010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 1011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (prevBP == bp) { 1012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fail for lack of forward progress. 1013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 1014f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1018f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Check that there we didn't miss an expected break between the last one 1019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and this one. 1020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=prevBP+1; i<bp; i++) { 1021f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(i) != 0) { 1022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expected[] = {0, i}; 1023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(t->dataToBreak, expected, 2); 1024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1025f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i, t->getSrcLine(i), t->getSrcCol(i)); 1026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check that the break we did find was expected 1030f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(bp) == 0) { 1031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expected[] = {0, bp}; 1032f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius printStringBreaks(t->textToBreak, expected, 2); 1033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1034f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The break was expected. 1037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check that the {nnn} tag value is correct. 
1038f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t expectedTagVal = t->getExpectedBreak(bp); 1039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedTagVal == -1) { 1040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedTagVal = 0; 1041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1042f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t line = t->getSrcLine(bp); 1043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); 1044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (rs != expectedTagVal) { 1045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru " Actual, Expected status = %4d, %4d", 1047f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prevBP = bp; 1052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Verify that there were no missed expected breaks after the last one found 1055f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) { 1056f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(i) != 0) { 1057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Forward Iteration, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1058f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i, t->getSrcLine(i), t->getSrcCol(i)); 1059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Run the iterator backwards, verify that the same breaks are found. 1064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1065f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. 1066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (prevBP == bp) { 1068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fail for lack of progress. 1069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 1070f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1074f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Check that we didn't miss an expected break between the last one 1075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and this one. (UVector returns zeros for index out of bounds.) 
1076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=prevBP-1; i>bp; i--) { 1077f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(i) != 0) { 1078f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1079f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i, t->getSrcLine(i), t->getSrcCol(i)); 1080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check that the break we did find was expected 1084f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(bp) == 0) { 1085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1086f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The break was expected. 1089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check that the {nnn} tag value is correct. 
1090f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t expectedTagVal = t->getExpectedBreak(bp); 1091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedTagVal == -1) { 1092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedTagVal = 0; 1093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1094f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int line = t->getSrcLine(bp); 1095f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t rs = t->bi->getRuleStatus(); 1096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (rs != expectedTagVal) { 1097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru " Actual, Expected status = %4d, %4d", 1099f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prevBP = bp; 1104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Verify that there were no missed breaks prior to the last one found 1107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=prevBP-1; i>=0; i--) { 1108f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(i) != 0) { 1109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Forward Itertion, break expected, but not found. 
Pos=%4d File line,col= %4d,%4d", 1110f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i, t->getSrcLine(i), t->getSrcCol(i)); 1111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 11138393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 11148393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius // Check isBoundary() 1115f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1116f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UBool boundaryExpected = (t->getExpectedBreak(i) != 0); 11178393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius UBool boundaryFound = t->bi->isBoundary(i); 11188393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if (boundaryExpected != boundaryFound) { 11198393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 11208393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius " Expected, Actual= %s, %s", 1121f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i, t->getSrcLine(i), t->getSrcCol(i), 11228393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius boundaryExpected ? "true":"false", boundaryFound? 
"true" : "false"); 11238393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11248393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11258393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 11268393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius // Check following() 1127f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 11288393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius int32_t actualBreak = t->bi->following(i); 11298393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius int32_t expectedBreak = BreakIterator::DONE; 1130f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) { 1131f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(j) != 0) { 11328393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius expectedBreak = j; 11338393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius break; 11348393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11358393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11368393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if (expectedBreak != actualBreak) { 11378393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius errln("following(%d) incorrect. 
File line,col= %4d,%4d\n" 11388393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius " Expected, Actual= %d, %d", 1139f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 11408393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11418393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11428393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 11438393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius // Check preceding() 1144f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (i=utext_nativeLength(t->textToBreak); i>=0; i--) { 11458393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius int32_t actualBreak = t->bi->preceding(i); 11468393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius int32_t expectedBreak = BreakIterator::DONE; 11478393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 1148f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent. 1149f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // preceding(trailing byte) will return the index of some preceding code point, 1150f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // not the lead byte of the current code point, even though that has a smaller index. 1151f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Therefore, start looking at the expected break data not at i-1, but at 1152f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // the start of code point index - 1. 
1153f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utext_setNativeIndex(t->textToBreak, i); 1154f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t j = utext_getNativeIndex(t->textToBreak) - 1; 1155f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (; j >= 0; j--) { 1156f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (t->getExpectedBreak(j) != 0) { 11578393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius expectedBreak = j; 11588393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius break; 11598393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11608393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11618393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if (expectedBreak != actualBreak) { 11628393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius errln("preceding(%d) incorrect. File line,col= %4d,%4d\n" 11638393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius " Expected, Actual= %d, %d", 1164f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 11658393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 11668393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 1167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestExtended() { 1171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Locale locale(""); 1174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString rules; 1176f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius TestParams tp(status); 
1177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 11781b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status); 11796d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru if (U_FAILURE(status)) { 11806d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 11816d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru } 1182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open and read the test data file. 1186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *testDataDirectory = IntlTest::getSourceTestData(status); 1188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char testFileName[1000]; 1189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Can't open test data. 
Path too long."); 1191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strcpy(testFileName, testDataDirectory); 1194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strcat(testFileName, "rbbitst.txt"); 1195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int len; 1197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; /* something went wrong, error already output */ 1200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Put the test data into a UnicodeString 1207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString testString(FALSE, testFile, len); 1209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru enum EParseState{ 1211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru PARSE_COMMENT, 1212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru PARSE_TAG, 1213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru PARSE_DATA, 1214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru PARSE_NUM 
1215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_TAG; 1217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru EParseState savedState = PARSE_TAG; 1219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar CH_LF = 0x0a; 1221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar CH_CR = 0x0d; 1222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar CH_HASH = 0x23; 1223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /*static const UChar CH_PERIOD = 0x2e;*/ 1224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar CH_LT = 0x3c; 1225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar CH_GT = 0x3e; 1226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar CH_BACKSLASH = 0x5c; 1227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const UChar CH_BULLET = 0x2022; 1228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lineNum = 1; 1230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t colStart = 0; 1231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t column = 0; 1232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t charIdx = 0; 1233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tagValue = 0; // The numeric value of a <nnn> tag. 
1235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (charIdx = 0; charIdx < len; ) { 1237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ZERO_ERROR; 1238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar c = testString.charAt(charIdx); 1239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx++; 1240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // treat CRLF as a unit 1242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = CH_LF; 1243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx++; 1244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_LF || c == CH_CR) { 1246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lineNum++; 1247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru colStart = charIdx; 1248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru column = charIdx - colStart + 1; 1250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (parseState) { 1252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case PARSE_COMMENT: 1253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == 0x0a || c == 0x0d) { 1254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = savedState; 1255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case PARSE_TAG: 
1259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_HASH) { 1261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_COMMENT; 1262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru savedState = PARSE_TAG; 1263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u_isUWhiteSpace(c)) { 1266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete tp.bi; 1270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.bi = BreakIterator::createWordInstance(locale, status); 1271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx += 5; 1272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete tp.bi; 1276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.bi = BreakIterator::createCharacterInstance(locale, status); 1277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx += 5; 1278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete tp.bi; 
1282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.bi = BreakIterator::createLineInstance(locale, status); 1283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx += 5; 1284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete tp.bi; 1288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.bi = NULL; 1289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.bi = BreakIterator::createSentenceInstance(locale, status); 1290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx += 5; 1291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete tp.bi; 1295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.bi = BreakIterator::createTitleInstance(locale, status); 1296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx += 6; 1297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // <locale loc_name> 1301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru localeMatcher.reset(testString); 1302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (localeMatcher.lookingAt(charIdx-1, status)) { 1303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString localeName = localeMatcher.group(1, status); 
1304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char localeName8[100]; 1305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru locale = Locale::createFromName(localeName8); 13078393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius charIdx += localeMatcher.group(0, status).length() - 1; 1308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_DATA; 1313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx += 5; 1314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.dataToBreak = ""; 1315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.expectedBreaks->removeAllElements(); 1316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->removeAllElements(); 1317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->removeAllElements(); 1318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("line %d: Tag expected in test file.", lineNum); 1322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_COMMENT; 1323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru savedState = PARSE_DATA; 1324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto end_test; // Stop the test. 
1325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case PARSE_DATA: 1329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_BULLET) { 1330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t breakIdx = tp.dataToBreak.length(); 1331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.expectedBreaks->setSize(breakIdx+1); 1332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.expectedBreaks->setElementAt(-1, breakIdx); 1333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->setSize(breakIdx+1); 1334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->setElementAt(lineNum, breakIdx); 1335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->setSize(breakIdx+1); 1336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->setElementAt(column, breakIdx); 1337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Add final entry to mappings from break location to source file position. 1342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Need one extra because last break position returned is after the 1343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // last char in the data, not at the last char. 
1344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->addElement(lineNum, status); 1345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->addElement(column, status); 1346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_TAG; 1348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx += 6; 1349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // RUN THE TEST! 1351f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius status = U_ZERO_ERROR; 1352f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius tp.setUTF16(status); 1353f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius executeTest(&tp, status); 1354f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius TEST_ASSERT_SUCCESS(status); 1355f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1356f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Run again, this time with UTF-8 text wrapped in a UText. 1357f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius status = U_ZERO_ERROR; 1358f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius tp.setUTF8(status); 1359f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius TEST_ASSERT_SUCCESS(status); 1360f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius executeTest(&tp, status); 1361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1364c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get the code point from the name and insert it into the test data. 
1367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (Damn, no API takes names in Unicode !!! 1368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we've got to take it back to char *) 1369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nameLength = nameEndIdx - (charIdx+2); 1371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char charNameBuf[200]; 1372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 theChar = -1; 1373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (nameEndIdx != -1) { 1374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charNameBuf[sizeof(charNameBuf)-1] = 0; 1377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru theChar = -1; 1380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (theChar == -1) { 1383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Error in named character in test file at line %d, col %d", 1384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lineNum, column); 1385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Named code point was recognized. 
Insert it 1387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // into the test data. 1388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.dataToBreak.append(theChar); 1389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tp.dataToBreak.length() > tp.srcLine->size()) { 1390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->addElement(lineNum, status); 1391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->addElement(column, status); 1392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (nameEndIdx > charIdx) { 1395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx = nameEndIdx+1; 1396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.compare(charIdx-1, 2, "<>") == 0) { 1405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx++; 1406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t breakIdx = tp.dataToBreak.length(); 1407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.expectedBreaks->setSize(breakIdx+1); 1408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.expectedBreaks->setElementAt(-1, breakIdx); 1409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->setSize(breakIdx+1); 
1410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->setElementAt(lineNum, breakIdx); 1411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->setSize(breakIdx+1); 1412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->setElementAt(column, breakIdx); 1413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_LT) { 1417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tagValue = 0; 1418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_NUM; 1419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_COMMENT; 1424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru savedState = PARSE_DATA; 1425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_BACKSLASH) { 1429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for \ at end of line, a line continuation. 
1430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Advance over (discard) the newline 1431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cp = testString.char32At(charIdx); 1432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have a CR LF 1434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Need an extra increment of the input ptr to move over both of them 1435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx++; 1436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (cp == CH_LF || cp == CH_CR) { 1438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lineNum++; 1439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru colStart = charIdx; 1440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx++; 1441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Let unescape handle the back slash. 1445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru cp = testString.unescapeAt(charIdx); 1446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (cp != -1) { 1447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Escape sequence was recognized. Insert the char 1448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // into the test data. 
1449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.dataToBreak.append(cp); 1450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tp.dataToBreak.length() > tp.srcLine->size()) { 1451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->addElement(lineNum, status); 1452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->addElement(column, status); 1453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Not a recognized backslash escape sequence. 1459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Take the next char as a literal. 1460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: Should this be an error? 1461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = testString.charAt(charIdx); 1462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charIdx = testString.moveIndex32(charIdx, 1); 1463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normal, non-escaped data char. 1466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.dataToBreak.append(c); 1467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Save the mapping from offset in the data to line/column numbers in 1469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the original input file. Will be used for better error messages only. 
1470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If there's an expected break before this char, the slot in the mapping 1471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // vector will already be set for this char; don't overwrite it. 1472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tp.dataToBreak.length() > tp.srcLine->size()) { 1473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->addElement(lineNum, status); 1474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->addElement(column, status); 1475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case PARSE_NUM: 1480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We are parsing an expected numeric tag value, like <1234>, 1481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // within a chunk of data. 1482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u_isUWhiteSpace(c)) { 1483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c == CH_GT) { 1487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Finished the number. Add the info to the expected break data, 1488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and switch parse state back to doing plain data. 
1489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_DATA; 1490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tagValue == 0) { 1491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tagValue = -1; 1492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t breakIdx = tp.dataToBreak.length(); 1494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.expectedBreaks->setSize(breakIdx+1); 1495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->setSize(breakIdx+1); 1497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcLine->setElementAt(lineNum, breakIdx); 1498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->setSize(breakIdx+1); 1499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tp.srcCol ->setElementAt(column, breakIdx); 1500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u_isdigit(c)) { 1504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tagValue = tagValue*10 + u_charDigitValue(c); 1505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Syntax Error in test file at line %d, col %d", 1509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lineNum, column); 1510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru parseState = PARSE_COMMENT; 
1511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto end_test; // Stop the test 1512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1517b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho dataerrln("ICU Error %s while parsing test file at line %d.", 1518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_errorName(status), lineNum); 1519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ZERO_ERROR; 1520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto end_test; // Stop the test 1521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruend_test: 1526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete [] testFile; 1527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 1528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------- 1532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 153350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// TestDictRules create a break iterator from source rules that includes a 153450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// dictionary range. Regression for bug #7130. 
Source rules 153550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// do not declare a break iterator type (word, line, sentence, etc. 153650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// but the dictionary code, without a type, would loop. 153750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 153850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho//------------------------------------------------------------------------------- 153950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid RBBITest::TestDictRules() { 154050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const char *rules = "$dictionary = [a-z]; \n" 154150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "!!forward; \n" 154250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "$dictionary $dictionary; \n" 154350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "!!reverse; \n" 154450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "$dictionary $dictionary; \n"; 154550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const char *text = "aa"; 154650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode status = U_ZERO_ERROR; 154750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UParseError parseError; 154850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 154950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho RuleBasedBreakIterator bi(rules, parseError, status); 155050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (U_SUCCESS(status)) { 155150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString utext = text; 155250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho bi.setText(utext); 155350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t position; 155450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t loops; 155550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for (loops = 0; loops<10; loops++) { 155650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho position = bi.next(); 155750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (position == RuleBasedBreakIterator::DONE) { 
155850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 155950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 156050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 156150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho TEST_ASSERT(loops == 1); 156250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 156350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 156450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 156550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 156650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 156750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 156850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 156950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho//------------------------------------------------------------------------------- 157050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 1571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// ReadAndConvertFile Read a text data file, convert it to UChars, and 1572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// return the datain one big UChar * buffer, which the caller must delete. 1573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// parameters: 1575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// fileName: the name of the file, with no directory part. The test data directory 1576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// is assumed. 1577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// ulen an out parameter, receives the actual length (in UChars) of the file data. 1578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// encoding The file encoding. If the file contains a BOM, that will override the encoding 1579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// specified here. The BOM, if it exists, will be stripped from the returned data. 
1580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Pass NULL for the system default encoding. 1581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// status 1582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// returns: 1583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The file data, converted to UChar. 1584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The caller must delete this when done with 1585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// delete [] theBuffer; 1586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// TODO: This is a clone of RegexTest::ReadAndConvertFile. 1588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Move this function to some common place. 1589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------- 1591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *retPtr = NULL; 1593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char *fileBuf = NULL; 1594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UConverter* conv = NULL; 1595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru FILE *f = NULL; 1596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ulen = 0; 1598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return retPtr; 1600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
1601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open the file. 1604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru f = fopen(fileName, "rb"); 1606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (f == 0) { 16076d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("Error opening test data file %s\n", fileName); 1608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_FILE_ACCESS_ERROR; 1609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 1610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Read it in 1613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int fileSize; 1615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int amt_read; 1616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fseek( f, 0, SEEK_END); 1618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileSize = ftell(f); 1619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileBuf = new char[fileSize]; 1620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fseek(f, 0, SEEK_SET); 1621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru amt_read = fread(fileBuf, 1, fileSize, f); 1622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (amt_read != fileSize || fileSize <= 0) { 1623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Error reading test data file."); 1624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto 
cleanUpAndReturn; 1625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look for a Unicode Signature (BOM) on the data just read 1629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t signatureLength; 1631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char * fileBufC; 1632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char* bomEncoding; 1633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileBufC = fileBuf; 1635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bomEncoding = ucnv_detectUnicodeSignature( 1636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileBuf, fileSize, &signatureLength, &status); 1637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(bomEncoding!=NULL ){ 1638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileBufC += signatureLength; 1639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileSize -= signatureLength; 1640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru encoding = bomEncoding; 1641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open a converter to take the rule file to UTF-16 1645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru conv = ucnv_open(encoding, &status); 1647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 
1648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru goto cleanUpAndReturn; 1649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Convert the rules to UChar. 1653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Preflight first to determine required buffer size. 1654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ulen = ucnv_toUChars(conv, 1656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru NULL, // dest, 1657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 0, // destCapacity, 1658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileBufC, 1659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileSize, 1660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status); 1661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (status == U_BUFFER_OVERFLOW_ERROR) { 1662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Buffer Overflow is expected from the preflight operation. 
1663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = U_ZERO_ERROR; 1664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru retPtr = new UChar[ulen+1]; 1666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_toUChars(conv, 1667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru retPtr, // dest, 1668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ulen+1, 1669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileBufC, 1670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileSize, 1671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru &status); 1672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QuerucleanUpAndReturn: 1675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fclose(f); 1676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete []fileBuf; 1677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ucnv_close(conv); 1678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1680b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete []retPtr; 1681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru retPtr = 0; 1682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ulen = 0; 1683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 1684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return retPtr; 1685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
1689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------------------- 1690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Run tests from each of the boundary test data files distributed by the Unicode Consortium 1692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------- 1694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestUnicodeFiles() { 1695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RuleBasedBreakIterator *bi; 1696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 169827f654740f2a26ad62a5c155af9199af9e69b889claireho bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 1701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru runUnicodeTestData("GraphemeBreakTest.txt", bi); 1702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 1704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 170527f654740f2a26ad62a5c155af9199af9e69b889claireho bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 1708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste 
Queru runUnicodeTestData("WordBreakTest.txt", bi); 1709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 1711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 171227f654740f2a26ad62a5c155af9199af9e69b889claireho bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 1715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru runUnicodeTestData("SentenceBreakTest.txt", bi); 1716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 1718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 171927f654740f2a26ad62a5c155af9199af9e69b889claireho bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 1722c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru runUnicodeTestData("LineBreakTest.txt", bi); 1723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 1725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 17281b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// Check for test cases from the Unicode test data files that are known to fail 17291b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// and should be skipped because ICU is not yet able to fully implement the spec. 
17301b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// See ticket #7270. 17311b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 17321b7d32f919554dda9c193b32188251337bc756f1Fredrik RoubertUBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { 17331b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file. 17341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198 17351b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202 17361b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214 17371b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246 17381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298 17391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302 17401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert }; 17411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (strcmp(fileName, "LineBreakTest.txt") != 0) { 17421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return FALSE; 17431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 17441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 17451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) { 17461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (testCase == UnicodeString(badTestCases[i])) { 17471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return logKnownIssue("7270"); 17481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 
17491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 17501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return FALSE; 17511b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert} 17521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 17531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 1754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------------------- 1755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Run tests from one of the boundary test data files distributed by the Unicode Consortium 1757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------- 1759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 1762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open and read the test data file, put it into a UnicodeString. 
1765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *testDataDirectory = IntlTest::getSourceTestData(status); 1767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char testFileName[1000]; 1768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 17696d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru dataerrln("Can't open test data. Path too long."); 1770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strcpy(testFileName, testDataDirectory); 1773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru strcat(testFileName, fileName); 1774c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1775c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru logln("Opening data file %s\n", fileName); 1776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int len; 1778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1779c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (status != U_FILE_ACCESS_ERROR) { 1780c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1781c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru TEST_ASSERT(testFile != NULL); 1782c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status) || testFile == NULL) { 1784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; /* something went wrong, error already output */ 1785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
1786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString testFileAsString(TRUE, testFile, len); 1787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Parse the test data file using a regular expression. 1790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Each kind of token is recognized in its own capture group; what type of item was scanned 1791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is identified by which group had a match. 1792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1793c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Caputure Group # 1 2 3 4 5 1794c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Parses this item: divide x hex digits comment \n unrecognized \n 1795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1796c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1797c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString testString; 1799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 breakPositions(status); 1800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int lineNumber = 1; 1801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
1806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scan through each test case, building up the string to be broken in testString, 1808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and the positions that should be boundaries in the breakPositions vector. 1809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 181027f654740f2a26ad62a5c155af9199af9e69b889claireho int spin = 0; 1811c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while (tokenMatcher.find()) { 181227f654740f2a26ad62a5c155af9199af9e69b889claireho if(tokenMatcher.hitEnd()) { 181327f654740f2a26ad62a5c155af9199af9e69b889claireho /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 181427f654740f2a26ad62a5c155af9199af9e69b889claireho This occurred when the text file was corrupt (wasn't marked as UTF-8) 181527f654740f2a26ad62a5c155af9199af9e69b889claireho and caused an infinite loop here on EBCDIC systems! 181627f654740f2a26ad62a5c155af9199af9e69b889claireho */ 181727f654740f2a26ad62a5c155af9199af9e69b889claireho fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 181827f654740f2a26ad62a5c155af9199af9e69b889claireho // return; 181927f654740f2a26ad62a5c155af9199af9e69b889claireho } 1820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tokenMatcher.start(1, status) >= 0) { 1821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned a divide sign, indicating a break position in the test data. 
1822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (testString.length()>0) { 1823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPositions.addElement(testString.length(), status); 1824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else if (tokenMatcher.start(2, status) >= 0) { 1827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned an 'x', meaning no break at this position in the test data 1828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Nothing to be done here. 1829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else if (tokenMatcher.start(3, status) >= 0) { 1831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned Hex digits. Convert them to binary, append to the character data string. 1832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int length = hexNumber.length(); 1834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (length<=8) { 1835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char buf[10]; 1836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = (UChar32)strtol(buf, NULL, 16); 1838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c<=0x10ffff) { 1839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testString.append(c); 1840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Error: Unicode Character value out of range. 
\'%s\', line %d.\n", 1842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileName, lineNumber); 1843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fileName, lineNumber); 1847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else if (tokenMatcher.start(4, status) >= 0) { 1850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned to end of a line, possibly skipping over a comment in the process. 1851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the line from the file contained test data, run the test now. 18521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { 1853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Clear out this test case. 1857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The string and breakPositions vector will be refilled as the next 1858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // test case is parsed. 
1859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testString.remove(); 1860c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru breakPositions.removeAllElements(); 1861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lineNumber++; 1862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanner catchall. Something unrecognized appeared on the line. 1864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char token[16]; 1865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString uToken = tokenMatcher.group(0, status); 1866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru token[sizeof(token)-1] = 0; 1868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 1869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Clean up, in preparation for continuing with the next line. 
1871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testString.remove(); 1872c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru breakPositions.removeAllElements(); 1873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lineNumber++; 1874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 1876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 1877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete [] testFile; 1882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------------------- 1886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// test data files. Do only a simple, forward-only check - 1889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// this test is mostly to check that ICU and the Unicode 1890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// data agree with each other. 
1891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------------------- 1893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString &testString, // Text data to be broken 1895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 *breakPositions, // Positions where breaks should be found. 1896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RuleBasedBreakIterator *bi) { 1897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t pos; // Break Position in the test string 1898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 1899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t expectedPos; // Expected break position (index into test string) 1900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(testString); 1902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = bi->first(); 1903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = bi->next(); 1904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (pos != BreakIterator::DONE) { 1906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedI >= breakPositions->size()) { 1907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Test file \"%s\", line %d, unexpected break found at position %d", 1908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testFileName, lineNumber, pos); 
1909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedPos = breakPositions->elementAti(expectedI); 1912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pos < expectedPos) { 1913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Test file \"%s\", line %d, unexpected break found at position %d", 1914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testFileName, lineNumber, pos); 1915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pos > expectedPos) { 1918c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testFileName, lineNumber, expectedPos); 1920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = bi->next(); 1923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedI++; 1924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1927c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
1930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------------------------------------------------- 1936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// classs RBBIMonkeyKind 1938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Monkey Test for Break Iteration 1940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Abstract interface class. Concrete derived classes independently 1941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// implement the break rules for different iterator types. 1942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The Monkey Test itself uses doesn't know which type of break iterator it is 1944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// testing, but works purely in terms of the interface defined here. 
1945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//--------------------------------------------------------------------------------------- 1947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass RBBIMonkeyKind { 1948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querupublic: 1949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Return a UVector of UnicodeSets, representing the character classes used 1950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for this type of iterator. 1951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual UVector *charClasses() = 0; 1952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Set the test text on which subsequent calls to next() will operate 1954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual void setText(const UnicodeString &s) = 0; 1955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the next break postion, starting from the prev break position, or from zero. 1957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Return -1 after reaching end of string. 
1958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual int32_t next(int32_t i) = 0; 1959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual ~RBBIMonkeyKind(); 1961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode deferredStatus; 1962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruprotected: 1965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBIMonkeyKind(); 1966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruprivate: 1968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 1969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBIMonkeyKind::RBBIMonkeyKind() { 1971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = U_ZERO_ERROR; 1972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBIMonkeyKind::~RBBIMonkeyKind() { 1975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 1979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Random Numbers. 
//                     Similar to standard lib rand() and srand()
//                     Not using library to
//                       1.  Get same results on all platforms.
//                       2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator by one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // uint32_t arithmetic wraps modulo 2^32, which is what the generator relies on.
    m_seed = m_seed * kMultiplier + kIncrement;

    // Drop the low 16 bits and keep the next 15.
    return (m_seed >> 16) & 0x7fff;
}


//------------------------------------------------------------------------------------------
//
//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
//                             of RBBIMonkeyKind.
1999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------ 2001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass RBBICharMonkey: public RBBIMonkeyKind { 2002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querupublic: 2003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBICharMonkey(); 2004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual ~RBBICharMonkey(); 2005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual UVector *charClasses(); 2006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual void setText(const UnicodeString &s); 2007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual int32_t next(int32_t i); 2008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruprivate: 2009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector *fSets; 2010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fCRLFSet; 2012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fControlSet; 2013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fExtendSet; 201454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeSet *fRegionalIndicatorSet; 2015c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fPrependSet; 2016c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fSpacingSet; 2017c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fLSet; 2018c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fVSet; 2019c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fTSet; 2020c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fLVSet; 
2021c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fLVTSet; 2022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fHangulSet; 2023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fAnySet; 2024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *fText; 2026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 2027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBICharMonkey::RBBICharMonkey() { 2030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 2031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fText = NULL; 2033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2034c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2035c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status); 2036c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); 203754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); 2038c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2039c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), 
status); 2040c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2041c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2042c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2043c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2044c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2045c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHangulSet = new UnicodeSet(); 2046c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHangulSet->addAll(*fLSet); 2047c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHangulSet->addAll(*fVSet); 2048c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHangulSet->addAll(*fTSet); 2049c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHangulSet->addAll(*fLVSet); 2050c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHangulSet->addAll(*fLVTSet); 2051103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fAnySet = new UnicodeSet(0, 0x10ffff); 2052103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 2053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets = new UVector(status); 2054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fCRLFSet, status); 2055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fControlSet, status); 2056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fExtendSet, status); 205754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 
fSets->addElement(fRegionalIndicatorSet, status); 2058103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (!fPrependSet->isEmpty()) { 2059103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fSets->addElement(fPrependSet, status); 2060103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 2061c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fSpacingSet, status); 2062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fHangulSet, status); 2063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fAnySet, status); 2064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 2065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = status; 2066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBICharMonkey::setText(const UnicodeString &s) { 2071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fText = &s; 2072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2076c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruint32_t RBBICharMonkey::next(int32_t prevPos) { 2077c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int p0, p1, p2, p3; // Indices of the significant code points around the 2078c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // break position being tested. The candidate break 2079c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // location is before p2. 
2080c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2081c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int breakPos = -1; 2082c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2083c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2084c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2085c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(deferredStatus)) { 2086c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return -1; 2087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2088c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2089c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Previous break at end of string. return DONE. 2090c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (prevPos >= fText->length()) { 2091c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return -1; 2092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2093c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru p0 = p1 = p2 = p3 = prevPos; 2094c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c3 = fText->char32At(prevPos); 2095c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c0 = c1 = c2 = 0; 209659d709d503bab6e2b61931737e662dd293b40578ccornelius (void)p0; // suppress set but not used warning. 209759d709d503bab6e2b61931737e662dd293b40578ccornelius (void)c0; 2098c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2099c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop runs once per "significant" character position in the input text. 2100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (;;) { 2101c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Move all of the positions forward in the input string. 
2102c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru p0 = p1; c0 = c1; 2103c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru p1 = p2; c1 = c2; 2104c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru p2 = p3; c2 = c3; 2105c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2106c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Advancd p3 by one codepoint 2107c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru p3 = fText->moveIndex32(p3, 1); 2108c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c3 = fText->char32At(p3); 2109c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (p1 == p2) { 2111c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Still warming up the loop. (won't work with zero length strings, but we don't care) 2112c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2113c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2114c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (p2 == fText->length()) { 2115c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Reached end of string. Always a break position. 2116c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 2117c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2119c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule GB3 CR x LF 2120c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No Extend or Format characters may appear between the CR and LF, 2121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which requires the additional check for p2 immediately following p1. 
2122c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2124c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB4). ( Control | CR | LF ) <break> 2128c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fControlSet->contains(c1) || 2129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c1 == 0x0D || 2130c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c1 == 0x0A) { 2131c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 2132c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2133c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB5) <break> ( Control | CR | LF ) 2135c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fControlSet->contains(c2) || 2137c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c2 == 0x0D || 2138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c2 == 0x0A) { 2139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 2140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2141c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB6) L x ( L | V | LV | LVT ) 2144c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fLSet->contains(c1) && 2145c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (fLSet->contains(c2) || 2146c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fVSet->contains(c2) 
|| 2147c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLVSet->contains(c2) || 2148c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLVTSet->contains(c2))) { 2149c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2150c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2151c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2152c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB7) ( LV | V ) x ( V | T ) 2153c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2154c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (fVSet->contains(c2) || fTSet->contains(c2))) { 2155c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2156c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2157c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2158c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB8) ( LVT | T) x T 2159c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2160c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fTSet->contains(c2)) { 2161c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2162c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2163c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 216454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Rule (GB8a) Regional_Indicator x Regional_Indicator 216554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 216654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius continue; 216754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 216854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 2169c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB9) Numeric x ALetter 
2170c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fExtendSet->contains(c2)) { 2171c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2172c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2173c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2174c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB9a) x SpacingMark 2175c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fSpacingSet->contains(c2)) { 2176c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2177c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2178c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2179c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB9b) Prepend x 2180c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fPrependSet->contains(c1)) { 2181c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 2182c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2183c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2184c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (GB10) Any <break> Any 2185c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 2186c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2187c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2188c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru breakPos = p2; 2189c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return breakPos; 2190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2193c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUVector *RBBICharMonkey::charClasses() { 2195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fSets; 
2196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBICharMonkey::~RBBICharMonkey() { 2200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSets; 2201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fCRLFSet; 2202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fControlSet; 2203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fExtendSet; 220454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius delete fRegionalIndicatorSet; 2205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fPrependSet; 2206c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fSpacingSet; 2207c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fLSet; 2208c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fVSet; 2209c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fTSet; 2210c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fLVSet; 2211c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fLVTSet; 2212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fHangulSet; 2213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fAnySet; 2214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------ 2217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// class RBBIWordMonkey Word Break specific implementation 2219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// of RBBIMonkeyKind. 
2220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------ 2222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass RBBIWordMonkey: public RBBIMonkeyKind { 2223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querupublic: 2224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBIWordMonkey(); 2225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual ~RBBIWordMonkey(); 2226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual UVector *charClasses(); 2227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual void setText(const UnicodeString &s); 2228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual int32_t next(int32_t i); 2229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruprivate: 2230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector *fSets; 2231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2232c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fCRSet; 2233c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fLFSet; 2234c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fNewlineSet; 223559d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet *fRegionalIndicatorSet; 2236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fKatakanaSet; 223759d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet *fHebrew_LetterSet; 2238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fALetterSet; 223954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO(jungshik): Do we still need this change? 
224054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 224159d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet *fSingle_QuoteSet; 224259d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet *fDouble_QuoteSet; 2243c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fMidNumLetSet; 2244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fMidLetterSet; 2245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fMidNumSet; 2246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fNumericSet; 2247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fFormatSet; 2248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fOtherSet; 2249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fExtendSet; 2250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fExtendNumLetSet; 225154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeSet *fDictionaryCjkSet; 2252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *fText; 2254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 2255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBIWordMonkey::RBBIWordMonkey() 2258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 2260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets = new UVector(status); 2262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2263c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 
fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2264c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2265c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 226654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status); 226754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Exclude Hangul syllables from ALetterSet during testing. 226854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Leave CJK dictionary characters out from the monkey tests! 226954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#if 0 227054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 227154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "[\\p{Line_Break = Complex_Context}" 227254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "-\\p{Grapheme_Cluster_Break = Extend}" 227354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "-\\p{Grapheme_Cluster_Break = Control}" 227454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "]]", 227554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius status); 227654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif 227759d709d503bab6e2b61931737e662dd293b40578ccornelius fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status); 227859d709d503bab6e2b61931737e662dd293b40578ccornelius fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 227959d709d503bab6e2b61931737e662dd293b40578ccornelius fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status); 228059d709d503bab6e2b61931737e662dd293b40578ccornelius 
fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 228154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fALetterSet->removeAll(*fDictionaryCjkSet); 228259d709d503bab6e2b61931737e662dd293b40578ccornelius fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status); 228359d709d503bab6e2b61931737e662dd293b40578ccornelius fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status); 228459d709d503bab6e2b61931737e662dd293b40578ccornelius fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 228559d709d503bab6e2b61931737e662dd293b40578ccornelius fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 228659d709d503bab6e2b61931737e662dd293b40578ccornelius fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 228754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test 228854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // we should figure out why 228959d709d503bab6e2b61931737e662dd293b40578ccornelius fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 229059d709d503bab6e2b61931737e662dd293b40578ccornelius fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 229159d709d503bab6e2b61931737e662dd293b40578ccornelius fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 229259d709d503bab6e2b61931737e662dd293b40578ccornelius fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet = new UnicodeSet(); 
2295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 2296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = status; 2297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->complement(); 2301c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fOtherSet->removeAll(*fCRSet); 2302c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fOtherSet->removeAll(*fLFSet); 2303c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fOtherSet->removeAll(*fNewlineSet); 2304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fKatakanaSet); 230559d709d503bab6e2b61931737e662dd293b40578ccornelius fOtherSet->removeAll(*fHebrew_LetterSet); 2306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fALetterSet); 230759d709d503bab6e2b61931737e662dd293b40578ccornelius fOtherSet->removeAll(*fSingle_QuoteSet); 230859d709d503bab6e2b61931737e662dd293b40578ccornelius fOtherSet->removeAll(*fDouble_QuoteSet); 2309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fMidLetterSet); 2310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fMidNumSet); 2311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fNumericSet); 2312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fExtendNumLetSet); 2313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fFormatSet); 2314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fExtendSet); 231554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fOtherSet->removeAll(*fRegionalIndicatorSet); 
2316c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Inhibit dictionary characters from being tested at all. 231754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fOtherSet->removeAll(*fDictionaryCjkSet); 2318c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 232059d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fCRSet, status); 232159d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fLFSet, status); 232259d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fNewlineSet, status); 232354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fSets->addElement(fRegionalIndicatorSet, status); 232459d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fHebrew_LetterSet, status); 232559d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fALetterSet, status); 232659d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fSingle_QuoteSet, status); 232759d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fDouble_QuoteSet, status); 232859d709d503bab6e2b61931737e662dd293b40578ccornelius //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 232959d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fMidLetterSet, status); 233059d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fMidNumLetSet, status); 233159d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fMidNumSet, status); 233259d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fNumericSet, status); 233359d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fFormatSet, status); 233459d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fExtendSet, status); 233559d709d503bab6e2b61931737e662dd293b40578ccornelius 
fSets->addElement(fOtherSet, status); 233659d709d503bab6e2b61931737e662dd293b40578ccornelius fSets->addElement(fExtendNumLetSet, status); 2337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 2339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = status; 2340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBIWordMonkey::setText(const UnicodeString &s) { 2344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fText = &s; 2345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RBBIWordMonkey::next(int32_t prevPos) { 2349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int p0, p1, p2, p3; // Indices of the significant code points around the 2350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // break position being tested. The candidate break 2351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // location is before p2. 2352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int breakPos = -1; 2354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 
2356c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2357c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(deferredStatus)) { 2358c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return -1; 2359c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Prev break at end of string. return DONE. 2362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (prevPos >= fText->length()) { 2363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return -1; 2364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p0 = p1 = p2 = p3 = prevPos; 2366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c3 = fText->char32At(prevPos); 2367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c0 = c1 = c2 = 0; 236859d709d503bab6e2b61931737e662dd293b40578ccornelius (void)p0; // Suppress set but not used warning. 2369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop runs once per "significant" character position in the input text. 2371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 2372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Move all of the positions forward in the input string. 
2373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p0 = p1; c0 = c1; 2374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p1 = p2; c1 = c2; 2375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p2 = p3; c2 = c3; 2376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Advancd p3 by X(Extend | Format)* Rule 4 2378c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 2379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru do { 2380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p3 = fText->moveIndex32(p3, 1); 2381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c3 = fText->char32At(p3); 2382c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2383c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 2384c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru }; 2385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); 2387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p1 == p2) { 2390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Still warming up the loop. (won't work with zero length strings, but we don't care) 2391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p2 == fText->length()) { 2394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Reached end of string. 
Always a break position. 2395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (3) CR x LF 2399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No Extend or Format characters may appear between the CR and LF, 2400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // which requires the additional check for p2 immediately following p1. 2401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2402c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (c1==0x0D && c2==0x0A) { 2403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2405c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2406c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (3a) Break before and after newlines (including CR and LF) 2407c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2408c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2409c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 2410c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru }; 2411c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2412c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 2413c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru }; 2414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 241559d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (5). 
(ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 241659d709d503bab6e2b61931737e662dd293b40578ccornelius if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 241759d709d503bab6e2b61931737e662dd293b40578ccornelius (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 242159d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 242359d709d503bab6e2b61931737e662dd293b40578ccornelius if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 242459d709d503bab6e2b61931737e662dd293b40578ccornelius (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 242559d709d503bab6e2b61931737e662dd293b40578ccornelius (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 242659d709d503bab6e2b61931737e662dd293b40578ccornelius continue; 242759d709d503bab6e2b61931737e662dd293b40578ccornelius } 242859d709d503bab6e2b61931737e662dd293b40578ccornelius 242959d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 243059d709d503bab6e2b61931737e662dd293b40578ccornelius if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 243159d709d503bab6e2b61931737e662dd293b40578ccornelius (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 243259d709d503bab6e2b61931737e662dd293b40578ccornelius (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste 
Queru } 2435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 243659d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (7a) Hebrew_Letter x Single_Quote 243759d709d503bab6e2b61931737e662dd293b40578ccornelius if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 243859d709d503bab6e2b61931737e662dd293b40578ccornelius continue; 243959d709d503bab6e2b61931737e662dd293b40578ccornelius } 244059d709d503bab6e2b61931737e662dd293b40578ccornelius 244159d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 244259d709d503bab6e2b61931737e662dd293b40578ccornelius if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 244359d709d503bab6e2b61931737e662dd293b40578ccornelius continue; 244459d709d503bab6e2b61931737e662dd293b40578ccornelius } 2445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 244659d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 244759d709d503bab6e2b61931737e662dd293b40578ccornelius if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (8) Numeric x Numeric 2452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fNumericSet->contains(c1) && 2453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNumericSet->contains(c2)) { 2454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 245759d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (9) (ALetter | Hebrew_Letter) x 
Numeric 245859d709d503bab6e2b61931737e662dd293b40578ccornelius if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNumericSet->contains(c2)) { 2460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 246359d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fNumericSet->contains(c1) && 246559d709d503bab6e2b61931737e662dd293b40578ccornelius (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 246959d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2470c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fNumericSet->contains(c0) && 247159d709d503bab6e2b61931737e662dd293b40578ccornelius (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNumericSet->contains(c2)) { 2473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 247659d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fNumericSet->contains(c1) && 247859d709d503bab6e2b61931737e662dd293b40578ccornelius (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 
2479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNumericSet->contains(c3)) { 2480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (13) Katakana x Katakana 2484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fKatakanaSet->contains(c1) && 2485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fKatakanaSet->contains(c2)) { 2486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 248959d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 249059d709d503bab6e2b61931737e662dd293b40578ccornelius if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fExtendNumLetSet->contains(c2)) { 2493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 249454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 2495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 249659d709d503bab6e2b61931737e662dd293b40578ccornelius // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fExtendNumLetSet->contains(c1) && 249859d709d503bab6e2b61931737e662dd293b40578ccornelius (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 249959d709d503bab6e2b61931737e662dd293b40578ccornelius fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 
250059d709d503bab6e2b61931737e662dd293b40578ccornelius continue; 250154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 250254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 250354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Rule 13c 250454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 250554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius continue; 250654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 2507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule 14. Break found here. 2509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos = p2; 2513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return breakPos; 2514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUVector *RBBIWordMonkey::charClasses() { 2518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fSets; 2519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBIWordMonkey::~RBBIWordMonkey() { 2523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSets; 2524c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fCRSet; 2525c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fLFSet; 
2526c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fNewlineSet; 2527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fKatakanaSet; 252859d709d503bab6e2b61931737e662dd293b40578ccornelius delete fHebrew_LetterSet; 2529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fALetterSet; 253059d709d503bab6e2b61931737e662dd293b40578ccornelius delete fSingle_QuoteSet; 253159d709d503bab6e2b61931737e662dd293b40578ccornelius delete fDouble_QuoteSet; 2532c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fMidNumLetSet; 2533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fMidLetterSet; 2534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fMidNumSet; 2535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fNumericSet; 2536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fFormatSet; 2537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fExtendSet; 2538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fExtendNumLetSet; 253954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius delete fRegionalIndicatorSet; 254054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius delete fDictionaryCjkSet; 2541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fOtherSet; 2542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------ 2548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// class RBBISentMonkey Sentence Break specific 
implementation 2550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// of RBBIMonkeyKind. 2551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------ 2553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass RBBISentMonkey: public RBBIMonkeyKind { 2554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querupublic: 2555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBISentMonkey(); 2556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual ~RBBISentMonkey(); 2557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual UVector *charClasses(); 2558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual void setText(const UnicodeString &s); 2559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual int32_t next(int32_t i); 2560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruprivate: 2561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int moveBack(int posFrom); 2562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int moveForward(int posFrom); 2563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 cAt(int pos); 2564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector *fSets; 2566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fSepSet; 2568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fFormatSet; 2569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fSpSet; 2570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fLowerSet; 2571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fUpperSet; 
2572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fOLetterSet; 2573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fNumericSet; 2574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fATermSet; 2575c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *fSContinueSet; 2576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fSTermSet; 2577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fCloseSet; 2578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fOtherSet; 2579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fExtendSet; 2580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *fText; 2582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 2584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBISentMonkey::RBBISentMonkey() 2586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 2588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets = new UVector(status); 2590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2591c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2592c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // set and made into character classes of their own. For the monkey impl, 2593c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // they remain in SEP, since Sep always appears with CR and LF in the rules. 
2594c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2595c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2596c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2597c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2598c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2599c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2600c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2601c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2602c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2603c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2604c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2605c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet = new UnicodeSet(); 
2607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(U_FAILURE(status)) { 2609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = status; 2610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->complement(); 2614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fSepSet); 2615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fFormatSet); 2616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fSpSet); 2617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fLowerSet); 2618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fUpperSet); 2619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fOLetterSet); 2620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fNumericSet); 2621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fATermSet); 2622c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fOtherSet->removeAll(*fSContinueSet); 2623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fSTermSet); 2624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fCloseSet); 2625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fOtherSet->removeAll(*fExtendSet); 2626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2627c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fSepSet, status); 2628c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fFormatSet, status); 
2629c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fSpSet, status); 2630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fLowerSet, status); 2631c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fUpperSet, status); 2632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fOLetterSet, status); 2633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fNumericSet, status); 2634c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fATermSet, status); 2635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fSContinueSet, status); 2636c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fSTermSet, status); 2637c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fCloseSet, status); 2638c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fOtherSet, status); 2639c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSets->addElement(fExtendSet, status); 2640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 2642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = status; 2643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBISentMonkey::setText(const UnicodeString &s) { 2649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fText = &s; 2650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 
2651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUVector *RBBISentMonkey::charClasses() { 2653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fSets; 2654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// moveBack() Find the "significant" code point preceding the index i. 2658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Skips over ($Extend | $Format)* . 2659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint RBBISentMonkey::moveBack(int i) { 2661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i <= 0) { 2662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return -1; 2663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c; 2665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t j = i; 2666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru do { 2667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru j = fText->moveIndex32(j, -1); 2668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = fText->char32At(j); 2669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return j; 2672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
2676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint RBBISentMonkey::moveForward(int i) { 2677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i>=fText->length()) { 2678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fText->length(); 2679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c; 2681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t j = i; 2682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru do { 2683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru j = fText->moveIndex32(j, 1); 2684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = cAt(j); 2685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return j; 2688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUChar32 RBBISentMonkey::cAt(int pos) { 2691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pos<0 || pos>=fText->length()) { 2692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return -1; 2693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 2694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fText->char32At(pos); 2695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RBBISentMonkey::next(int32_t prevPos) { 2699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int p0, p1, p2, p3; // Indices of the significant 
code points around the 2700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // break position being tested. The candidate break 2701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // location is before p2. 2702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int breakPos = -1; 2704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c; 2707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2708c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(deferredStatus)) { 2709c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return -1; 2710c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2711c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Prev break at end of string. return DONE. 2713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (prevPos >= fText->length()) { 2714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return -1; 2715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p0 = p1 = p2 = p3 = prevPos; 2717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c3 = fText->char32At(prevPos); 2718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c0 = c1 = c2 = 0; 271959d709d503bab6e2b61931737e662dd293b40578ccornelius (void)p0; // Suppress set but not used warning. 2720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop runs once per "significant" character position in the input text. 
2722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 2723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Move all of the positions forward in the input string. 2724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p0 = p1; c0 = c1; 2725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p1 = p2; c1 = c2; 2726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p2 = p3; c2 = c3; 2727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Advancd p3 by X(Extend | Format)* Rule 4 2729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p3 = moveForward(p3); 2730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c3 = cAt(p3); 2731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (3) CR x LF 2733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (4). Sep <break> 2738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSepSet->contains(c1)) { 2739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p2 = p1+1; // Separators don't combine with Extend or Format. 2740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p2 >= fText->length()) { 2744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Reached end of string. Always a break position. 
2745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (p2 == prevPos) { 2749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Still warming up the loop. (won't work with zero length strings, but we don't care) 2750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (6). ATerm x Numeric 2754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (7). Upper ATerm x Uppper 2759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note: STerm | ATerm are added to the negated part of the expression by a 2765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // note to the Unicode 5.0 documents. 
2766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int p8 = p1; 2767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fSpSet->contains(cAt(p8))) { 2768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p8 = moveBack(p8); 2769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fCloseSet->contains(cAt(p8))) { 2771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p8 = moveBack(p8); 2772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fATermSet->contains(cAt(p8))) { 2774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p8=p2; 2775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 2776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = cAt(p8); 2777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLowerSet->contains(c) || fSepSet->contains(c) || 2779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fATermSet->contains(c) || fSTermSet->contains(c)) { 2780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p8 = moveForward(p8); 2783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fLowerSet->contains(cAt(p8))) { 2785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2789c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule 8a 
(STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2790c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p8 = p1; 2792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fSpSet->contains(cAt(p8))) { 2793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p8 = moveBack(p8); 2794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fCloseSet->contains(cAt(p8))) { 2796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p8 = moveBack(p8); 2797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = cAt(p8); 2799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2804c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int p9 = p1; 2806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fCloseSet->contains(cAt(p9))) { 2807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p9 = moveBack(p9); 2808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = cAt(p9); 2810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 2811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if 
(fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2816c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int p10 = p1; 2818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fSpSet->contains(cAt(p10))) { 2819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p10 = moveBack(p10); 2820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fCloseSet->contains(cAt(p10))) { 2822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p10 = moveBack(p10); 2823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2830c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
<break> 2831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int p11 = p1; 2832c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fSepSet->contains(cAt(p11))) { 2833c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru p11 = moveBack(p11); 2834c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fSpSet->contains(cAt(p11))) { 2836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p11 = moveBack(p11); 2837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (fCloseSet->contains(cAt(p11))) { 2839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p11 = moveBack(p11); 2840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule (12) Any x Any 2846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 2847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos = p2; 2849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return breakPos; 2850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBISentMonkey::~RBBISentMonkey() { 2853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSets; 2854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSepSet; 
2855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fFormatSet; 2856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSpSet; 2857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fLowerSet; 2858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fUpperSet; 2859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fOLetterSet; 2860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fNumericSet; 2861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fATermSet; 2862c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete fSContinueSet; 2863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSTermSet; 2864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fCloseSet; 2865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fOtherSet; 2866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fExtendSet; 2867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------- 2872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// RBBILineMonkey 2874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------- 2876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass RBBILineMonkey: public RBBIMonkeyKind { 
2878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querupublic: 2879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBILineMonkey(); 2880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual ~RBBILineMonkey(); 2881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual UVector *charClasses(); 2882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual void setText(const UnicodeString &s); 2883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual int32_t next(int32_t i); 2884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruprivate: 2886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector *fSets; 2887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fBK; 2889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fCR; 2890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fLF; 2891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fCM; 2892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fNL; 2893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fSG; 2894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fWJ; 2895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fZW; 2896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fGL; 2897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fCB; 2898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fSP; 2899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fB2; 2900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fBA; 
2901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fBB; 2902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fHY; 2903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fH2; 2904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fH3; 2905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fCL; 290650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeSet *fCP; 2907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fEX; 2908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fIN; 2909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fJL; 2910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fJV; 2911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fJT; 2912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fNS; 2913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fOP; 2914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fQU; 2915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fIS; 2916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fNU; 2917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fPO; 2918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fPR; 2919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fSY; 2920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fAI; 2921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fAL; 2922103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UnicodeSet *fCJ; 2923103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UnicodeSet *fHL; 2924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fID; 292554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeSet 
*fRI; 2926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fSA; 2927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *fXX; 2928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 292959d709d503bab6e2b61931737e662dd293b40578ccornelius BreakIterator *fCharBI; 2930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString *fText; 2931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexMatcher *fNumberMatcher; 2932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 2933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBILineMonkey::RBBILineMonkey() 2936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 2938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets = new UVector(status); 2940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2941c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 2942c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 2943c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 2944c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 2945c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 2946c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fWJ = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 2947c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 2948c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 2949c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 2950c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 2951c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 2952c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 2953c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 2954c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 2955c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 2956c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 2957c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 295850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 2959c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 2960c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fIN = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 2961c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 2962c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 2963c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 2964c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 2965c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 2966c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 2967c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 2968c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 2969c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 2970c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 2971c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 2972c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 2973c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 2974103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fCJ = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 2975103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 2976c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 297754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 2978c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); 2979c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 2980c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 2981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 2983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = status; 2984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharBI = NULL; 2985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNumberMatcher = NULL; 2986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fAL->addAll(*fXX); // Default behavior for XX is identical to AL 2990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fAL->addAll(*fAI); // Default behavior for AI is identical to AL 2991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL 2992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fAL->addAll(*fSG); // 
Default behavior for SG is identical to AL. 2993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2994103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 2995103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 2996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fBK, status); 2997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fCR, status); 2998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fLF, status); 2999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fCM, status); 3000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fNL, status); 3001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fWJ, status); 3002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fZW, status); 3003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fGL, status); 3004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fCB, status); 3005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fSP, status); 3006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fB2, status); 3007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fBA, status); 3008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fBB, status); 3009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fHY, status); 3010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fH2, status); 3011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fH3, status); 3012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fCL, status); 301350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
fSets->addElement(fCP, status); 3014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fEX, status); 3015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fIN, status); 3016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fJL, status); 3017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fJT, status); 3018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fJV, status); 3019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fNS, status); 3020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fOP, status); 3021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fQU, status); 3022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fIS, status); 3023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fNU, status); 3024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fPO, status); 3025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fPR, status); 3026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fSY, status); 3027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fAI, status); 3028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fAL, status); 3029103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fSets->addElement(fHL, status); 3030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fID, status); 3031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fWJ, status); 303254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fSets->addElement(fRI, status); 3033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fSA, status); 
3034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSets->addElement(fSG, status); 3035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3036c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru const char *rules = 3037c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" 3038c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" 3039c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "\\p{Line_Break=NU}\\p{Line_Break=CM}*" 3040c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" 304150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 3042c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; 3043c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNumberMatcher = new RegexMatcher( 3045c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString(rules, -1, US_INV), 0, status); 3046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 3050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deferredStatus = status; 3051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
3055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBILineMonkey::setText(const UnicodeString &s) { 3056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fText = &s; 3057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharBI->setText(s); 3058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNumberMatcher->reset(s); 3059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// rule9Adjust 3063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Line Break TR rules 9 and 10 implementation. 3064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This deals with combining marks and other sequences that 3065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// that must be treated as if they were something other than what they actually are. 3066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This is factored out into a separate function because it must be applied twice for 3068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// each potential break, once to the chars before the position being checked, then 3069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// again to the text following the possible break. 3070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pos == -1) { 3073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Invalid initial position. 
Happens during the warmup iteration of the 3074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // main loop in next(). 3075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nPos = *nextPos; 3079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 9 Keep combining sequences together. 3081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // advance over any CM class chars. Note that Line Break CM is different 3082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // from the normal Grapheme Extend property. 3083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3084b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *nextChar = fText->char32At(nPos); 3087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!fCM->contains(*nextChar)) { 3088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nPos = fText->moveIndex32(nPos, 1); 3091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3095b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 9 Treat X CM* as if it were x. 
3096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No explicit action required. 3097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 10 Treat any remaining combining mark as AL 3099b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fCM->contains(*posChar)) { 3100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *posChar = 0x41; // thisChar = 'A'; 3101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Push the updated nextPos and nextChar back to our caller. 3104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This only makes a difference if posChar got bigger by consuming a 3105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // combining sequence. 3106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *nextPos = nPos; 3107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *nextChar = fText->char32At(nPos); 3108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RBBILineMonkey::next(int32_t startPos) { 3113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t pos; // Index of the char following a potential break position 3115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 thisChar; // Character at above position "pos" 3116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t prevPos; // Index 
of the char preceding a potential break position 3118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 prevChar; // Character at above position. Note that prevChar 3119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and thisChar may not be adjacent because combining 3120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // characters between them will be ignored. 3121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3122103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t prevPosX2; // Second previous character. Wider context for LB21a. 3123103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UChar32 prevCharX2; 3124103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 3125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nextPos; // Index of the next character following pos. 3126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Usually skips over combining marks. 3127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nextCPPos; // Index of the code point following "pos." 3128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // May point to a combining mark. 3129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t tPos; // temp value. 
3130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c; 3131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3132c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(deferredStatus)) { 3133c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return -1; 3134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3135c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (startPos >= fText->length()) { 3137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return -1; 3138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Initial values for loop. Loop will run the first time without finding breaks, 3142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // while the invalid values shift out and the "this" and 3143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // "prev" positions are filled in with good values. 3144103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 3145103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius thisChar = prevChar = prevCharX2 = 0; 3146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextPos = nextCPPos = startPos; 3147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop runs once per position in the test text, until a break position 3150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is found. 
3151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3152103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius prevPosX2 = prevPos; 3153103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius prevCharX2 = prevChar; 3154103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 3155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prevPos = pos; 3156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prevChar = thisChar; 3157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = nextPos; 3159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru thisChar = fText->char32At(pos); 3160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextCPPos = fText->moveIndex32(pos, 1); 3162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextPos = nextCPPos; 3163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule LB2 - Break at end of text. 3165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (pos >= fText->length()) { 3166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Rule LB 9 - adjust for combining sequences. 3170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We do this one out-of-order because the adjustment does not change anything 3171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 3172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // be applied. 
3173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = fText->char32At(nextPos); 3176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru rule9Adjust(pos, &thisChar, &nextPos, &c); 3177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the loop is still warming up - if we haven't shifted the initial 3179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // -1 positions out of prevPos yet - loop back to advance the 3180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // position in the input without any further looking for breaks. 3181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (prevPos == -1) { 3182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 4 Always break after hard line breaks, 3186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fBK->contains(prevChar)) { 3187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 5 Break after CR, LF, NL, but not inside CR LF 3191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (prevChar == 0x0d && thisChar == 0x0a) { 3192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste 
Queru if (prevChar == 0x0d || 3195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prevChar == 0x0a || 3196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prevChar == 0x85) { 3197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 6 Don't break before hard line breaks 3201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fBK->contains(thisChar)) { 3203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 7 Don't break before spaces or zero-width space. 
3208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSP->contains(thisChar)) { 3209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fZW->contains(thisChar)) { 3213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 8 Break after zero width space 3217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fZW->contains(prevChar)) { 3218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 9, 10 Already done, at top of loop. 3222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 11 Do not break before or after WORD JOINER and related characters. 
3226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // x WJ 3227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // WJ x 3228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 12 3234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // GL x 3235c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fGL->contains(prevChar)) { 3236c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru continue; 3237c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3238c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3239c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // LB 12a 3240c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // [^SP BA HY] x GL 3241c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (!(fSP->contains(prevChar) || 3242c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fBA->contains(prevChar) || 3243c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 13 Don't break before closings. 
325050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // NU x CL, NU x CP and NU x IS are not matched here so that they will 3251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // fall into LB 17 and the more general number regular expression. 3252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 325327f654740f2a26ad62a5c155af9199af9e69b889claireho if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 325427f654740f2a26ad62a5c155af9199af9e69b889claireho (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 325527f654740f2a26ad62a5c155af9199af9e69b889claireho fEX->contains(thisChar) || 325627f654740f2a26ad62a5c155af9199af9e69b889claireho (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 325727f654740f2a26ad62a5c155af9199af9e69b889claireho (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 14 Don't break after OP SP* 3262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scan backwards, checking for this sequence. 3263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The OP char could include combining marks, so we actually check for 3264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // OP CM* SP* 3265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Another Twist: The Rule 67 fixes may have changed a SP CM 3266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sequence into a ID char, so before scanning back through spaces, 3267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // verify that prevChar is indeed a space. 
The prevChar variable 3268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // may differ from fText[prevPos] 3269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos = prevPos; 3270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSP->contains(prevChar)) { 3271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos=fText->moveIndex32(tPos, -1); 3273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos=fText->moveIndex32(tPos, -1); 3277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fOP->contains(fText->char32At(tPos))) { 3279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 15 QU SP* x OP 3284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fOP->contains(thisChar)) { 3285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int tPos = prevPos; 3287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos = fText->moveIndex32(tPos, -1); 3289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
3290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos = fText->moveIndex32(tPos, -1); 3292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fQU->contains(fText->char32At(tPos))) { 3294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 330050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // LB 16 (CL | CP) SP* x NS 330150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Scan backwards for SP* CM* (CL | CP) 3302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fNS->contains(thisChar)) { 3303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int tPos = prevPos; 3304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos = fText->moveIndex32(tPos, -1); 3306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos = fText->moveIndex32(tPos, -1); 3309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 331050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
3313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 17 B2 SP* x B2 3317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fB2->contains(thisChar)) { 3318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scan backwards, checking for the B2 CM* SP* sequence. 3319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos = prevPos; 3320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSP->contains(prevChar)) { 3321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos=fText->moveIndex32(tPos, -1); 3323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tPos=fText->moveIndex32(tPos, -1); 3327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fB2->contains(fText->char32At(tPos))) { 3329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 18 break after space 3335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSP->contains(prevChar)) { 3336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste 
Queru break; 3337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 19 3340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // x QU 3341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // QU x 3342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 20 Break around a CB 3347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 21 3352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fBA->contains(thisChar) || 3353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fHY->contains(thisChar) || 3354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNS->contains(thisChar) || 3355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fBB->contains(prevChar) ) { 3356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3359103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // LB 21a 3360103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // HL (HY | BA) x 3361103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (fHL->contains(prevCharX2) && 
3362103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3363103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius continue; 3364103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 3365103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 336659d709d503bab6e2b61931737e662dd293b40578ccornelius // LB 21b 336759d709d503bab6e2b61931737e662dd293b40578ccornelius // SY x HL 336859d709d503bab6e2b61931737e662dd293b40578ccornelius if (fSY->contains(prevChar) && fHL->contains(thisChar)) { 336959d709d503bab6e2b61931737e662dd293b40578ccornelius continue; 337059d709d503bab6e2b61931737e662dd293b40578ccornelius } 337159d709d503bab6e2b61931737e662dd293b40578ccornelius 3372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 22 337327f654740f2a26ad62a5c155af9199af9e69b889claireho if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3374103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (fHL->contains(prevChar) && fIN->contains(thisChar)) || 337527f654740f2a26ad62a5c155af9199af9e69b889claireho (fID->contains(prevChar) && fIN->contains(thisChar)) || 337627f654740f2a26ad62a5c155af9199af9e69b889claireho (fIN->contains(prevChar) && fIN->contains(thisChar)) || 337727f654740f2a26ad62a5c155af9199af9e69b889claireho (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 23 ID x PO 3383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // AL x NU 3384103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // HL x NU 3385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // NU x AL 338627f654740f2a26ad62a5c155af9199af9e69b889claireho if 
((fID->contains(prevChar) && fPO->contains(thisChar)) || 338727f654740f2a26ad62a5c155af9199af9e69b889claireho (fAL->contains(prevChar) && fNU->contains(thisChar)) || 3388103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (fHL->contains(prevChar) && fNU->contains(thisChar)) || 3389103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (fNU->contains(prevChar) && fAL->contains(thisChar)) || 3390103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (fNU->contains(prevChar) && fHL->contains(thisChar)) ) { 3391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 24 Do not break between prefix and letters or ideographs. 3395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // PR x ID 3396103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // PR x (AL | HL) 3397103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // PO x (AL | HL) 339827f654740f2a26ad62a5c155af9199af9e69b889claireho if ((fPR->contains(prevChar) && fID->contains(thisChar)) || 3399103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) || 3400103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) { 3401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 25 Numbers 3407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fNumberMatcher->lookingAt(prevPos, 
status)) { 3408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 3409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Matched a number. But could have been just a single digit, which would 3412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not represent a "no break here" between prevChar and thisChar 3413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (numEndIdx > pos) { 3415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Number match includes at least our two chars being checked 3416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (numEndIdx > nextPos) { 3417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Number match includes additional chars. Update pos and nextPos 3418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // so that next loop iteration will continue at the end of the number, 3419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // checking for breaks between last char in number & whatever follows. 
3420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = nextPos = numEndIdx; 3421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru do { 3422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = fText->moveIndex32(pos, -1); 3423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru thisChar = fText->char32At(pos); 3424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } while (fCM->contains(thisChar)); 3425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 26 Do not break a Korean syllable. 3432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fJV->contains(thisChar) || 3434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fH2->contains(thisChar) || 3435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fH3->contains(thisChar))) { 3436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
3443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fJT->contains(thisChar)) { 3446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 27 Treat a Korean Syllable Block the same as ID. 3450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIN->contains(thisChar)) { 3453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPO->contains(thisChar)) { 3458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 
3463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3467c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // LB 28 Do not break between alphabetics ("at"). 3468103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3473103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 347750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
347850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // (AL | NU) x OP 347950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // CP x (AL | NU) 3480103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 348150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 348250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 3483103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 348450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 348550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 348650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 348754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // LB30a Do not break between regional indicators. 348854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // RI x RI 348954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 349054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius continue; 349154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 349254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 3493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LB 31 Break everywhere else 3494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return pos; 3499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUVector 
*RBBILineMonkey::charClasses() { 3503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fSets; 3504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRBBILineMonkey::~RBBILineMonkey() { 3508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSets; 3509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fBK; 3511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fCR; 3512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fLF; 3513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fCM; 3514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fNL; 3515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fWJ; 3516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fZW; 3517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fGL; 3518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fCB; 3519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSP; 3520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fB2; 3521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fBA; 3522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fBB; 3523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fHY; 3524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fH2; 3525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fH3; 3526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fCL; 352750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho delete fCP; 3528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fEX; 
3529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fIN; 3530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fJL; 3531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fJV; 3532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fJT; 3533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fNS; 3534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fOP; 3535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fQU; 3536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fIS; 3537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fNU; 3538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fPO; 3539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fPR; 3540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSY; 3541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fAI; 3542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fAL; 3543103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius delete fCJ; 3544103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius delete fHL; 3545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fID; 354654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius delete fRI; 3547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSA; 3548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fSG; 3549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fXX; 3550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fCharBI; 3552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete fNumberMatcher; 3553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
3555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------- 3557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// TestMonkey 3559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// params 3561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// seed=nnnnn Random number starting seed. 3562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Setting the seed allows errors to be reproduced. 3563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// loop=nnn Looping count. Controls running time. 3564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// -1: run forever. 3565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 0 or greater: run length. 
3566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// type = char | word | line | sent | title 3568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------------------- 3570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t val = defaultVal; 3573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name.append(" *= *(-?\\d+)"); 3574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexMatcher m(name, params, 0, status); 3576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (m.find()) { 3577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The param exists. Convert the string to an int. 
3578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char valString[100]; 3579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t paramLength = m.end(1, status) - m.start(1, status); 3580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru paramLength = (int32_t)(sizeof(valString)-2); 3582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru val = strtol(valString, NULL, 10); 3585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Delete this parameter from the params string. 3587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru m.reset(); 3588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru params = m.replaceFirst("", status); 3589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(U_SUCCESS(status)); 3591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return val; 3592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 3594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 359554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi, 3598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expected[], 3599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int 
expectedcount) 3600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int count = 0; 3602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i = 0; 3603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int forward[50]; 3604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(ustr); 3605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forward[count] = i; 3607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count < expectedcount && expected[count] != i) { 3608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("break forward test failed: expected %d but got %d", 3609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expected[count], i); 3610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 3613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count != expectedcount) { 3615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(ustr, expected, expectedcount); 3616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("break forward test failed: missed %d match", 3617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedcount - count); 3618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // testing boundaries 3621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 1; i < expectedcount; i ++) { 
3622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int j = expected[i - 1]; 3623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!bi->isBoundary(j)) { 3624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(ustr, expected, expectedcount); 3625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("isBoundary() failed. Expected boundary at position %d", j); 3626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (bi->isBoundary(j)) { 3630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(ustr, expected, expectedcount); 3631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 3639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forward[count] != i) { 364054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius printStringBreaks(ustr, expected, expectedcount); 3641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("happy break test previous() failed: expected %d but got %d", 3642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forward[count], i); 
3643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count != 0) { 3647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(ustr, expected, expectedcount); 3648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("break test previous() failed: missed a match"); 3649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // testing preceding 3653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i < expectedcount - 1; i ++) { 3654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // int j = expected[i] + 1; 3655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int j = ustr.moveIndex32(expected[i], 1); 3656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (; j <= expected[i + 1]; j ++) { 3657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (bi->preceding(j) != expected[i]) { 3658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(ustr, expected, expectedcount); 3659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru test->errln("preceding(): Not expecting boundary at position %d", j); 3660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 366554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif 
3666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestWordBreaks(void) 3668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Locale locale("en"); 3672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 367554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Replaced any C+J characters in a row with a random sequence of characters 367654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // of the same length to make our C+J segmentation not get in the way. 
3677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const char *strlist[] = 3678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 368054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3681c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 368354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\uac00\\u3588\\u009c\\u0953\\u194b", 3684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 368654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2027\\U000e0067\\u0a47\\u00b7", 3690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0589\\U000e006e\\u0a42\\U000104a5", 369354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 
3694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0027\\u11af\\U000e0057\\u0602", 3696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001d7f2\\U000e007\\u0004\\u0589", 3697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3700c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0233\\U000e0020\\u0a69\\u0d6a", 3704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 370554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\u18f4\\U000e0049\\u20e7\\u2027", 3706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ua183\\u102d\\u0bec\\u003a", 3708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003a\\u0e57\\u0fad\\u002e", 3710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 
3712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003a\\u0664\\u00b7\\u1fba", 3714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003b\\u0027\\u00b7\\u47a3", 371554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 3719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int loop; 3720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 37216d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // printf("looping %d\n", loop); 3726c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // RBBICharMonkey monkey; 3728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBIWordMonkey monkey; 3729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expected[50]; 3731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int 
expectedcount = 0; 3732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru monkey.setText(ustr); 3734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 3735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expected[expectedcount ++] = i; 3737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 3742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 3743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestWordBoundary(void) 3746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Locale locale("en"); 3749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar str[50]; 3753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const char *strlist[] = 
3754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2027\\U000e0067\\u0a47\\u00b7", 3759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0589\\U000e006e\\u0a42\\U000104a5", 3762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0027\\u11af\\U000e0057\\u0602", 3765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001d7f2\\U000e007\\u0004\\u0589", 3766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 377054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\U000e0065\\u302c\\u09ee\\U000e0068", 3771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 
3772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0233\\U000e0020\\u0a69\\u0d6a", 3773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u58f4\\U000e0049\\u20e7\\u2027", 377554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ua183\\u102d\\u0bec\\u003a", 3777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003a\\u0e57\\u0fad\\u002e", 3779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003a\\u0664\\u00b7\\u1fba", 3783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u003b\\u0027\\u00b7\\u47a3", 3784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 3785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int loop; 3786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 37876d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 
3791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // printf("looping %d\n", loop); 3792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_unescape(strlist[loop], str, 20); 3793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString ustr(str); 3794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int forward[50]; 3795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int count = 0; 3796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(ustr); 3798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int prev = 0; 3799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 3800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forward[count ++] = i; 3802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i > prev) { 3803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int j; 3804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (j = prev + 1; j < i; j ++) { 3805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (bi->isBoundary(j)) { 3806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(ustr, forward, count); 3807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("happy boundary test failed: expected %d not a boundary", 3808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru j); 3809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!bi->isBoundary(i)) { 
3814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(ustr, forward, count); 3815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("happy boundary test failed: expected %d a boundary", 3816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru i); 3817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru prev = i; 3820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 3823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestLineBreaks(void) 3826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Locale locale("en"); 3829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const int32_t STRSIZE = 50; 3832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar str[STRSIZE]; 3833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const char *strlist[] = 3834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
"\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", 3846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
"\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 3856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 3857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 3858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 3859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 3860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 
3861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", 3862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 3863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 3864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 3865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 3866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 3867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", 3868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 3869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 3870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2014\\u0020\\u000a\\u17c5\\u24fc", 3871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 3872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 3873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 3874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 3875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 3876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 
3877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" 3878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" 3879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", 3880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 3881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 3882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 3883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int loop; 3884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 3885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 3886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // printf("looping %d\n", loop); 3890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t t = u_unescape(strlist[loop], str, STRSIZE); 3891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (t >= STRSIZE) { 3892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT(FALSE); 3893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString 
ustr(str); 3898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBILineMonkey monkey; 3899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(monkey.deferredStatus)) { 3900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const int EXPECTEDSIZE = 50; 3904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expected[EXPECTEDSIZE]; 3905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expectedcount = 0; 3906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru monkey.setText(ustr); 3908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 3909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedcount >= EXPECTEDSIZE) { 3911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expected[expectedcount ++] = i; 3915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 3920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 3921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste 
Queru} 3922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestSentBreaks(void) 3924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 3925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Locale locale("en"); 3927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 3929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar str[200]; 3930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static const char *strlist[] = 3931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Now\ris\nthe\r\ntime\n\rfor\r\r", 3933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "This\n", 3934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 3935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\"Sentence ending with a quote.\" Bye.", 3936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru " (This is it). Testing the sentence iterator. \"This isn't it.\"", 3937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 3938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 3939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. 
", 3940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 3941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 3942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 3943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 3944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 3945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 3946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 3947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 3948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 3949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 3950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 3951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 3952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 3953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int loop; 3954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(status)) { 39556d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Creation of break 
iterator failed %s", u_errorName(status)); 3956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 3959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); 3960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString ustr(str); 3961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBISentMonkey monkey; 3963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(monkey.deferredStatus)) { 3964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru continue; 3965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const int EXPECTEDSIZE = 50; 3968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expected[EXPECTEDSIZE]; 3969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expectedcount = 0; 3970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru monkey.setText(ustr); 3972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 3973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedcount >= EXPECTEDSIZE) { 3975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT(expectedcount < EXPECTEDSIZE); 3976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
3978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expected[expectedcount ++] = i; 3979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 3984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 3985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestMonkey(char *params) { 3988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 3989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 3991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopCount = 500; 3992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t seed = 1; 3993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString breakType = "all"; 3994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Locale locale("en"); 3995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool useUText = FALSE; 3996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (quick == FALSE) { 3998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopCount = 10000; 3999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (params) { 
4002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString p(params); 4003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopCount = getIntParam("loop", p, loopCount); 4004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru seed = getIntParam("seed", p, seed); 4005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (m.find()) { 4008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakType = m.group(1, status); 4009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru m.reset(); 4010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p = m.replaceFirst("", status); 4011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexMatcher u(" *utext", p, 0, status); 4014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u.find()) { 4015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru useUText = TRUE; 4016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u.reset(); 4017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p = u.replaceFirst("", status); 4018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // m.reset(p); 4022c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Each option is stripped out of the option string as it is processed. 
4024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // All options have been checked. The option string should have been completely emptied.. 4025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char buf[100]; 4026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru p.extract(buf, sizeof(buf), NULL, status); 4027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru buf[sizeof(buf)-1] = 0; 4028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Unrecognized or extra parameter: %s\n", buf); 4029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 4030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakType == "char" || breakType == "all") { 4035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBICharMonkey m; 4036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 4038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RunMonkey(bi, m, "char", seed, loopCount, useUText); 4039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakType == "all" && useUText==FALSE) { 4040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Also run a quick test with UText when "all" is specified 4041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 
40456d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 4048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakType == "word" || breakType == "all") { 4051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("Word Break Monkey Test"); 4052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBIWordMonkey m; 4053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 4055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RunMonkey(bi, m, "word", seed, loopCount, useUText); 4056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 40586d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 4059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 4061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakType == "line" || breakType == "all") { 4064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("Line Break Monkey Test"); 4065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBILineMonkey m; 4066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = 
BreakIterator::createLineInstance(locale, status); 4067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (loopCount >= 10) { 4068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopCount = loopCount / 5; // Line break runs slower than the others. 4069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 4071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RunMonkey(bi, m, "line", seed, loopCount, useUText); 4072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 40746d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 4077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakType == "sent" || breakType == "all" ) { 4080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru logln("Sentence Break Monkey Test"); 4081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RBBISentMonkey m; 4082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (loopCount >= 10) { 4084b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopCount = loopCount / 10; // Sentence runs slower than the other break types 4085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status)) { 4087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 40906d5deb12725f146643d443090dfa11b206df528aJean-Baptiste Queru errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete bi; 4093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4095b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 4096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4099b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Run a RBBI monkey test. Common routine, for all break iterator types. 4100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Parameters: 4101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// bi - the break iterator to use 4102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// mk - MonkeyKind, abstraction for obtaining expected results 4103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// name - Name of test (char, word, etc.) 
for use in error messages 4104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// seed - Seed for starting random number generator (parameter from user) 4105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// numIterations 4106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numIterations, UBool useUText) { 4109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 4111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const int32_t TESTSTRINGLEN = 500; 4113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString testText; 4114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numCharClasses; 4115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector *chClasses; 4116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expected[TESTSTRINGLEN*2 + 1]; 4117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int expectedCount = 0; 4118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char reverseBreaks[TESTSTRINGLEN*2+1]; 4121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char followingBreaks[TESTSTRINGLEN*2+1]; 4123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char precedingBreaks[TESTSTRINGLEN*2+1]; 
4124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int i; 4125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int loopCount = 0; 4126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru m_seed = seed; 4128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numCharClasses = mk.charClasses()->size(); 4130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru chClasses = mk.charClasses(); 4131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for errors that occured during the construction of the MonkeyKind object. 4133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Can't report them where they occured because errln() is a method coming from intlTest, 4134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and is not visible outside of RBBITest :-( 4135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(mk.deferredStatus)) { 4136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 4138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Verify that the character classes all have at least one member. 
4141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<numCharClasses; i++) { 4142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (s == NULL || s->size() == 0) { 4144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("Character Class #%d is null or of zero size.", i); 4145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 4146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru while (loopCount < numIterations || numIterations == -1) { 4150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (numIterations == -1 && loopCount % 10 == 0) { 4151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If test is running in an infinite loop, display a periodic tic so 4152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we can tell that it is making progress. 4153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fprintf(stderr, "."); 4154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Save current random number seed, so that we can recreate the random numbers 4156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for this loop iteration in event of an error. 4157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru seed = m_seed; 4158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Populate a test string with data. 
4160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testText.truncate(0); 4161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<TESTSTRINGLEN; i++) { 4162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t aClassNum = m_rand() % numCharClasses; 4163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t charIdx = m_rand() % classSet->size(); 4165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = classSet->charAt(charIdx); 4166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c < 0) { // TODO: deal with sets containing strings. 4167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("c < 0"); 4168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru testText.append(c); 4171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Calculate the expected results for this test string. 
4174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru mk.setText(testText); 4175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedBreaks[0] = 1; 4177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t breakPos = 0; 4178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedCount = 0; 4179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos = mk.next(breakPos); 4181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakPos == -1) { 4182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakPos > testText.length()) { 4185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("breakPos > testText.length()"); 4186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expectedBreaks[breakPos] = 1; 4188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(expectedCount<testText.length()); 4189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru expected[expectedCount ++] = breakPos; 419059d709d503bab6e2b61931737e662dd293b40578ccornelius (void)expected; // Set but not used warning. 419159d709d503bab6e2b61931737e662dd293b40578ccornelius // TODO (andy): check it out. 
4192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the break positions using forward iteration 4195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (useUText) { 4197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 4198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // testUText = utext_openUnicodeString(testUText, &testText, &status); 4200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(testUText, status); 4201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru TEST_ASSERT_SUCCESS(status); 4202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru utext_close(testUText); // The break iterator does a shallow clone of the UText 4203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This UText can be closed immediately, so long as the 4204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // testText string continues to exist. 
4205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 4206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(testText); 4207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i < 0 || i > testText.length()) { 4211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardBreaks[i] = 1; 4215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the break positions using reverse iteration 4218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i < 0 || i > testText.length()) { 4221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru reverseBreaks[i] = 1; 4225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
4227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the break positions using isBoundary() tests. 4228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<=testText.length(); i++) { 4231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru isBoundaryBreaks[i] = bi->isBoundary(i); 4232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the break positions using the following() function. 4236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // printf("."); 4237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru memset(followingBreaks, 0, sizeof(followingBreaks)); 4238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t lastBreakPos = 0; 4239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru followingBreaks[0] = 1; 4240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<testText.length(); i++) { 4241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos = bi->following(i); 4242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakPos <= i || 4243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos < lastBreakPos || 4244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos > testText.length() || 424527f654740f2a26ad62a5c155af9199af9e69b889claireho (breakPos > lastBreakPos && lastBreakPos > i)) { 4246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("%s break monkey test: " 4247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Out 
of range value returned by BreakIterator::following().\n" 4248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name, seed, i, breakPos, lastBreakPos); 4250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru followingBreaks[breakPos] = 1; 4253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastBreakPos = breakPos; 4254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the break positions using the preceding() function. 4257c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastBreakPos = testText.length(); 4259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru precedingBreaks[testText.length()] = 1; 4260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=testText.length(); i>0; i--) { 4261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos = bi->preceding(i); 4262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (breakPos >= i || 4263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru breakPos > lastBreakPos || 426427f654740f2a26ad62a5c155af9199af9e69b889claireho (breakPos < 0 && testText.getChar32Start(i)>0) || 426527f654740f2a26ad62a5c155af9199af9e69b889claireho (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errln("%s break monkey test: " 4267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "Out of range value returned by 
BreakIterator::preceding().\n" 4268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru "index=%d; prev returned %d; lastBreak=%d" , 4269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru name, i, breakPos, lastBreakPos); 4270c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4271c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru precedingBreaks[i] = 2; // Forces an error. 4272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 4274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (breakPos >= 0) { 4275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru precedingBreaks[breakPos] = 1; 4276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastBreakPos = breakPos; 4278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compare the expected and actual results. 
4282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<=testText.length(); i++) { 4283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const char *errorType = NULL; 4284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardBreaks[i] != expectedBreaks[i]) { 4285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorType = "next()"; 4286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (reverseBreaks[i] != forwardBreaks[i]) { 4287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorType = "previous()"; 4288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorType = "isBoundary()"; 4290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (followingBreaks[i] != expectedBreaks[i]) { 4291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorType = "following()"; 4292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (precedingBreaks[i] != expectedBreaks[i]) { 4293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorType = "preceding()"; 4294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (errorType != NULL) { 4298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Format a range of the test text that includes the failure as 4299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a data item that can be included in the rbbi test data file. 
4300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Start of the range is the last point where expected and actual results 4302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // both agreed that there was a break position. 4303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int startContext = i; 4304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t count = 0; 4305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (startContext==0) { break; } 4307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru startContext --; 4308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedBreaks[startContext] != 0) { 4309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count == 2) break; 4310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count ++; 4311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // End of range is two expected breaks past the start position. 4315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int endContext = i + 1; 4316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int ci; 4317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (ci=0; ci<2; ci++) { // Number of items to include in error text. 
4318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (endContext >= testText.length()) {break;} 4320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (expectedBreaks[endContext-1] != 0) { 4321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (count == 0) break; 4322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru count --; 4323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru endContext ++; 4325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString errorText = "<data>"; 4330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /***if (strcmp(errorType, "next()") == 0) { 4331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru startContext = 0; 4332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru endContext = testText.length(); 4333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printStringBreaks(testText, expected, expectedCount); 4335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }***/ 4336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (ci=startContext; ci<endContext;) { 4338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString hexChars("0123456789abcdef"); 4339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c; 4340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int bn; 
4341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = testText.char32At(ci); 4342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ci == i) { 4343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This is the location of the error. 4344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.append("<?>"); 4345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (expectedBreaks[ci] != 0) { 4346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This a non-error expected break position. 4347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.append("\\"); 4348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c < 0x10000) { 4350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.append("\\u"); 4351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (bn=12; bn>=0; bn-=4) { 4352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.append(hexChars.charAt((c>>bn)&0xf)); 4353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 4355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.append("\\U"); 4356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (bn=28; bn>=0; bn-=4) { 4357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.append(hexChars.charAt((c>>bn)&0xf)); 4358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ci = testText.moveIndex32(ci, 1); 4361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.append("\\"); 4363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
errorText.append("</data>\n"); 4364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Output the error 4366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru char charErrorTxt[500]; 4367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 4368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4370103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4371103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 4372103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4373103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru errorType, seed, i, charErrorTxt); 4375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopCount++; 4380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 4382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 438427f654740f2a26ad62a5c155af9199af9e69b889claireho 438527f654740f2a26ad62a5c155af9199af9e69b889claireho// Bug 5532. UTF-8 based UText fails in dictionary code. 
438627f654740f2a26ad62a5c155af9199af9e69b889claireho// This test checks the initial patch, 438727f654740f2a26ad62a5c155af9199af9e69b889claireho// which is to just keep it from crashing. Correct word boundaries 438827f654740f2a26ad62a5c155af9199af9e69b889claireho// await a proper fix to the dictionary code. 438927f654740f2a26ad62a5c155af9199af9e69b889claireho// 439027f654740f2a26ad62a5c155af9199af9e69b889clairehovoid RBBITest::TestBug5532(void) { 439127f654740f2a26ad62a5c155af9199af9e69b889claireho // Text includes a mixture of Thai and Latin. 439227f654740f2a26ad62a5c155af9199af9e69b889claireho const unsigned char utf8Data[] = { 439327f654740f2a26ad62a5c155af9199af9e69b889claireho 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 439427f654740f2a26ad62a5c155af9199af9e69b889claireho 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 439527f654740f2a26ad62a5c155af9199af9e69b889claireho 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 439627f654740f2a26ad62a5c155af9199af9e69b889claireho 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 439727f654740f2a26ad62a5c155af9199af9e69b889claireho 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 439827f654740f2a26ad62a5c155af9199af9e69b889claireho 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 439927f654740f2a26ad62a5c155af9199af9e69b889claireho 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 440027f654740f2a26ad62a5c155af9199af9e69b889claireho 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 440127f654740f2a26ad62a5c155af9199af9e69b889claireho 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 440227f654740f2a26ad62a5c155af9199af9e69b889claireho 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 440327f654740f2a26ad62a5c155af9199af9e69b889claireho 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 
440427f654740f2a26ad62a5c155af9199af9e69b889claireho 440527f654740f2a26ad62a5c155af9199af9e69b889claireho UErrorCode status = U_ZERO_ERROR; 440627f654740f2a26ad62a5c155af9199af9e69b889claireho UText utext=UTEXT_INITIALIZER; 440727f654740f2a26ad62a5c155af9199af9e69b889claireho utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 440827f654740f2a26ad62a5c155af9199af9e69b889claireho TEST_ASSERT_SUCCESS(status); 440927f654740f2a26ad62a5c155af9199af9e69b889claireho 441027f654740f2a26ad62a5c155af9199af9e69b889claireho BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 441127f654740f2a26ad62a5c155af9199af9e69b889claireho TEST_ASSERT_SUCCESS(status); 441227f654740f2a26ad62a5c155af9199af9e69b889claireho if (U_SUCCESS(status)) { 441327f654740f2a26ad62a5c155af9199af9e69b889claireho bi->setText(&utext, status); 441427f654740f2a26ad62a5c155af9199af9e69b889claireho TEST_ASSERT_SUCCESS(status); 441527f654740f2a26ad62a5c155af9199af9e69b889claireho 441627f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t breakCount = 0; 441727f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t previousBreak = -1; 441827f654740f2a26ad62a5c155af9199af9e69b889claireho for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 441927f654740f2a26ad62a5c155af9199af9e69b889claireho // For now, just make sure that the break iterator doesn't hang. 
442027f654740f2a26ad62a5c155af9199af9e69b889claireho TEST_ASSERT(previousBreak < bi->current()); 442127f654740f2a26ad62a5c155af9199af9e69b889claireho previousBreak = bi->current(); 442227f654740f2a26ad62a5c155af9199af9e69b889claireho } 442327f654740f2a26ad62a5c155af9199af9e69b889claireho TEST_ASSERT(breakCount > 0); 442427f654740f2a26ad62a5c155af9199af9e69b889claireho } 442527f654740f2a26ad62a5c155af9199af9e69b889claireho delete bi; 442627f654740f2a26ad62a5c155af9199af9e69b889claireho utext_close(&utext); 442727f654740f2a26ad62a5c155af9199af9e69b889claireho} 442827f654740f2a26ad62a5c155af9199af9e69b889claireho 442927f654740f2a26ad62a5c155af9199af9e69b889claireho 44308393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Corneliusvoid RBBITest::TestBug9983(void) { 44318393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius UnicodeString text = UnicodeString("\\u002A" // * Other 44328393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius "\\uFF65" // Other 44338393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius "\\u309C" // Katakana 44348393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius "\\uFF9F" // Extend 44358393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius "\\uFF65" // Other 44368393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius "\\u0020" // Other 44378393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius "\\u0000").unescape(); 44388393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 44398393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius UErrorCode status = U_ZERO_ERROR; 44408393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 44418393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius BreakIterator::createWordInstance(Locale::getRoot(), status))); 44428393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius TEST_ASSERT_SUCCESS(status); 444359d709d503bab6e2b61931737e662dd293b40578ccornelius LocalPointer<RuleBasedBreakIterator> 
brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 444459d709d503bab6e2b61931737e662dd293b40578ccornelius BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 444559d709d503bab6e2b61931737e662dd293b40578ccornelius TEST_ASSERT_SUCCESS(status); 44468393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if (U_FAILURE(status)) { 44478393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius return; 44488393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 444959d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t offset, rstatus, iterationCount; 445059d709d503bab6e2b61931737e662dd293b40578ccornelius 44518393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius brkiter->setText(text); 44528393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius brkiter->last(); 445359d709d503bab6e2b61931737e662dd293b40578ccornelius iterationCount = 0; 44548393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius while ( (offset = brkiter->previous()) != UBRK_DONE ) { 44558393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius iterationCount++; 44568393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius rstatus = brkiter->getRuleStatus(); 445759d709d503bab6e2b61931737e662dd293b40578ccornelius (void)rstatus; // Suppress set but not used warning. 
445859d709d503bab6e2b61931737e662dd293b40578ccornelius if (iterationCount >= 10) { 445959d709d503bab6e2b61931737e662dd293b40578ccornelius break; 446059d709d503bab6e2b61931737e662dd293b40578ccornelius } 446159d709d503bab6e2b61931737e662dd293b40578ccornelius } 446259d709d503bab6e2b61931737e662dd293b40578ccornelius TEST_ASSERT(iterationCount == 6); 446359d709d503bab6e2b61931737e662dd293b40578ccornelius 446459d709d503bab6e2b61931737e662dd293b40578ccornelius brkiterPOSIX->setText(text); 446559d709d503bab6e2b61931737e662dd293b40578ccornelius brkiterPOSIX->last(); 446659d709d503bab6e2b61931737e662dd293b40578ccornelius iterationCount = 0; 446759d709d503bab6e2b61931737e662dd293b40578ccornelius while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 446859d709d503bab6e2b61931737e662dd293b40578ccornelius iterationCount++; 446959d709d503bab6e2b61931737e662dd293b40578ccornelius rstatus = brkiterPOSIX->getRuleStatus(); 447059d709d503bab6e2b61931737e662dd293b40578ccornelius (void)rstatus; // Suppress set but not used warning. 44718393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if (iterationCount >= 10) { 44728393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius break; 44738393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 44748393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 44758393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius TEST_ASSERT(iterationCount == 6); 44768393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius} 44778393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 44788393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 4479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// TestDebug - A place-holder test for debugging purposes. 
4481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// For putting in fragments of other tests that can be invoked 4482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for tracing without a lot of unwanted extra stuff happening. 4483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RBBITest::TestDebug(void) { 4485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if 0 4486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 4487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int pos = 0; 4488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int ruleStatus = 0; 4489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RuleBasedBreakIterator* bi = 4491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 4492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); 4493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); 4494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); 4495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // UnicodeString s("Aaa. 
Bcd"); 4496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s = s.unescape(); 4497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru bi->setText(s); 4498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool r = bi->isBoundary(8); 4499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("%s", r?"true":"false"); 4500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 4501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = bi->last(); 4502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru do { 4503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ruleStatus = bi->getRuleStatus(); 4504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru printf("%d\t%d\n", pos, ruleStatus); 4505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru pos = bi->previous(); 4506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } while (pos != BreakIterator::DONE); 4507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif 4508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4510103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliusvoid RBBITest::TestProperties() { 4511103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UErrorCode errorCode = U_ZERO_ERROR; 4512103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4513103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (!prependSet.isEmpty()) { 4514103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius errln( 4515103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius "[:GCB=Prepend:] is not empty any more. 
" 4516103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius "Uncomment relevant lines in source/data/brkitr/char.txt and " 4517103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius "change this test to the opposite condition."); 4518103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 4519103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius} 4520103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 4521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4522