dictbe.h revision f9878a236aa0d9662d8e40cafdaf2e04cd615835
1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 3f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Copyright (C) 2006-2014, International Business Machines Corporation * 4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * and others. All Rights Reserved. * 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef DICTBE_H 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DICTBE_H 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utext.h" 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "brkeng.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1954dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass DictionaryMatcher; 20f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusclass Normalizer2; 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/******************************************************************* 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * DictionaryBreakEngine 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * dictionary to determine language-specific breaks.</p> 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>After it is constructed a DictionaryBreakEngine may be shared between 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * threads without synchronization.</p> 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass DictionaryBreakEngine : public LanguageBreakEngine { 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru private: 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The set of characters handled by this engine 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @internal 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet fSet; 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The set of break types handled by this engine 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @internal 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t fTypes; 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>Default constructor.</p> 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DictionaryBreakEngine(); 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public: 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>Constructor setting the break types handled.</p> 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param breakTypes A bitmap of types handled by the engine. 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru DictionaryBreakEngine( uint32_t breakTypes ); 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>Virtual destructor.</p> 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~DictionaryBreakEngine(); 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 6954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius /** 7054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * <p>Indicate whether this engine handles a particular character for 7154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * a particular kind of break.</p> 7254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 7354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param c A character which begins a run that the engine might handle 7454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param breakType The type of text break which the caller wants to determine 7554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @return TRUE if this engine handles the particular character and break 7654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * type. 7754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual UBool handles( UChar32 c, int32_t breakType ) const; 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius /** 8154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * <p>Find any breaks within a run in the supplied text.</p> 8254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 8354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param text A UText representing the text. The iterator is left at 8454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * the end of the run of characters which the engine is capable of handling 8554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * that starts from the first (or last) character in the range. 8654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param startPos The start of the run within the supplied text. 8754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param endPos The end of the run within the supplied text. 8854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param reverse Whether the caller is looking for breaks in a reverse 8954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * direction. 9054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param breakType The type of break desired, or -1. 9154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param foundBreaks An allocated C array of the breaks found, if any 9254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @return The number of breaks found. 9354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual int32_t findBreaks( UText *text, 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t startPos, 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t endPos, 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool reverse, 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t breakType, 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UStack &foundBreaks ) const; 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru protected: 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>Set the character set handled by this engine.</p> 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param set A UnicodeSet of the set of characters handled by the engine 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual void setCharacters( const UnicodeSet &set ); 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>Set the break types handled by this engine.</p> 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param breakTypes A bitmap of types handled by the engine. 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// virtual void setBreakTypes( uint32_t breakTypes ); 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 11854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param text A UText representing the text 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param rangeStart The start of the range of dictionary characters 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param rangeEnd The end of the range of dictionary characters 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param foundBreaks Output of C array of int32_t break positions, or 0 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return The number of breaks found 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual int32_t divideUpDictionaryRange( UText *text, 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeStart, 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeEnd, 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UStack &foundBreaks ) const = 0; 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/******************************************************************* 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ThaiBreakEngine 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 13954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * dictionary and heuristics to determine Thai-specific breaks.</p> 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>After it is constructed a ThaiBreakEngine may be shared between 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * threads without synchronization.</p> 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass ThaiBreakEngine : public DictionaryBreakEngine { 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru private: 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The set of characters handled by this engine 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @internal 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet fThaiWordSet; 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet fEndWordSet; 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet fBeginWordSet; 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet fSuffixSet; 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet fMarkSet; 15654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius DictionaryMatcher *fDictionary; 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public: 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>Default constructor.</p> 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 16354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * engine is deleted. 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 16654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>Virtual destructor.</p> 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~ThaiBreakEngine(); 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru protected: 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 17554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param text A UText representing the text 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param rangeStart The start of the range of dictionary characters 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param rangeEnd The end of the range of dictionary characters 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param foundBreaks Output of C array of int32_t break positions, or 0 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return The number of breaks found 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual int32_t divideUpDictionaryRange( UText *text, 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeStart, 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeEnd, 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UStack &foundBreaks ) const; 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 19054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/******************************************************************* 19159d709d503bab6e2b61931737e662dd293b40578ccornelius * LaoBreakEngine 19254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 19354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 19454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/** 19559d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 19659d709d503bab6e2b61931737e662dd293b40578ccornelius * dictionary and heuristics to determine Lao-specific breaks.</p> 19759d709d503bab6e2b61931737e662dd293b40578ccornelius * 19859d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>After it is constructed a LaoBreakEngine may be shared between 19959d709d503bab6e2b61931737e662dd293b40578ccornelius * threads without synchronization.</p> 20054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 20159d709d503bab6e2b61931737e662dd293b40578ccorneliusclass LaoBreakEngine : public DictionaryBreakEngine { 20259d709d503bab6e2b61931737e662dd293b40578ccornelius private: 20354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius /** 20454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * The set of characters handled by this engine 20554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @internal 20654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 20754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 20859d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fLaoWordSet; 20959d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fEndWordSet; 21059d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fBeginWordSet; 21159d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fMarkSet; 21254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius DictionaryMatcher *fDictionary; 21354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 21454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius public: 21554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 21659d709d503bab6e2b61931737e662dd293b40578ccornelius /** 21759d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>Default constructor.</p> 21859d709d503bab6e2b61931737e662dd293b40578ccornelius * 21959d709d503bab6e2b61931737e662dd293b40578ccornelius * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 22059d709d503bab6e2b61931737e662dd293b40578ccornelius * engine is deleted. 22159d709d503bab6e2b61931737e662dd293b40578ccornelius */ 22259d709d503bab6e2b61931737e662dd293b40578ccornelius LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 22354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 22459d709d503bab6e2b61931737e662dd293b40578ccornelius /** 22559d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>Virtual destructor.</p> 22659d709d503bab6e2b61931737e662dd293b40578ccornelius */ 22759d709d503bab6e2b61931737e662dd293b40578ccornelius virtual ~LaoBreakEngine(); 22854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 22954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius protected: 23059d709d503bab6e2b61931737e662dd293b40578ccornelius /** 23159d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 23259d709d503bab6e2b61931737e662dd293b40578ccornelius * 23359d709d503bab6e2b61931737e662dd293b40578ccornelius * @param text A UText representing the text 23459d709d503bab6e2b61931737e662dd293b40578ccornelius * @param rangeStart The start of the range of dictionary characters 23559d709d503bab6e2b61931737e662dd293b40578ccornelius * @param rangeEnd The end of the range of dictionary characters 23659d709d503bab6e2b61931737e662dd293b40578ccornelius * @param foundBreaks Output of C array of int32_t break positions, or 0 23759d709d503bab6e2b61931737e662dd293b40578ccornelius * @return The number of breaks found 23859d709d503bab6e2b61931737e662dd293b40578ccornelius */ 23954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual int32_t divideUpDictionaryRange( UText *text, 24059d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t rangeStart, 24159d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t rangeEnd, 24259d709d503bab6e2b61931737e662dd293b40578ccornelius UStack &foundBreaks ) const; 24354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 24454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}; 24554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 246b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/******************************************************************* 247f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * BurmeseBreakEngine 248f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 249f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 250f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/** 251f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 252f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 253f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * 254f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * <p>After it is constructed a BurmeseBreakEngine may be shared between 255f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * threads without synchronization.</p> 256f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 257f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusclass BurmeseBreakEngine : public DictionaryBreakEngine { 258f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius private: 259f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 260f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * The set of characters handled by this engine 261f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @internal 262f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 263f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 264f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeSet fBurmeseWordSet; 265f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeSet fEndWordSet; 266f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeSet fBeginWordSet; 267f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeSet fMarkSet; 268f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius DictionaryMatcher *fDictionary; 269f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 270f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius public: 271f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 272f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 273f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * <p>Default constructor.</p> 274f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * 275f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 276f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * engine is deleted. 277f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 278f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 279f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 280f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 281f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * <p>Virtual destructor.</p> 282f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 283f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius virtual ~BurmeseBreakEngine(); 284f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 285f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius protected: 286f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 287f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * <p>Divide up a range of known dictionary characters.</p> 288f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * 289f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @param text A UText representing the text 290f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @param rangeStart The start of the range of dictionary characters 291f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @param rangeEnd The end of the range of dictionary characters 292f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @param foundBreaks Output of C array of int32_t break positions, or 0 293f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @return The number of breaks found 294f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 295f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius virtual int32_t divideUpDictionaryRange( UText *text, 296f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t rangeStart, 297f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t rangeEnd, 298f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UStack &foundBreaks ) const; 299f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 300f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius}; 301f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 302f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/******************************************************************* 303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * KhmerBreakEngine 304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 306b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/** 307b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 30854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 310b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * <p>After it is constructed a KhmerBreakEngine may be shared between 311b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * threads without synchronization.</p> 312b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 313b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass KhmerBreakEngine : public DictionaryBreakEngine { 314b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho private: 315b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 316b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * The set of characters handled by this engine 317b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @internal 318b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 319b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 320b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeSet fKhmerWordSet; 321b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeSet fEndWordSet; 322b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeSet fBeginWordSet; 323b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeSet fMarkSet; 32454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius DictionaryMatcher *fDictionary; 325b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 326b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho public: 327b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 328b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 329b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * <p>Default constructor.</p> 330b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 33154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 332b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * engine is deleted. 333b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 33454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 335b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 336b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 337b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * <p>Virtual destructor.</p> 338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 339b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual ~KhmerBreakEngine(); 340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 341b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho protected: 342b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 343b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * <p>Divide up a range of known dictionary characters.</p> 344b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 345b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param text A UText representing the text 346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param rangeStart The start of the range of dictionary characters 347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param rangeEnd The end of the range of dictionary characters 348b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param foundBreaks Output of C array of int32_t break positions, or 0 349b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The number of breaks found 350b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 351b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual int32_t divideUpDictionaryRange( UText *text, 352b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t rangeStart, 353b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t rangeEnd, 354b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStack &foundBreaks ) const; 355b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 357b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 35859d709d503bab6e2b61931737e662dd293b40578ccornelius#if !UCONFIG_NO_NORMALIZATION 35959d709d503bab6e2b61931737e662dd293b40578ccornelius 36059d709d503bab6e2b61931737e662dd293b40578ccornelius/******************************************************************* 36159d709d503bab6e2b61931737e662dd293b40578ccornelius * CjkBreakEngine 36259d709d503bab6e2b61931737e662dd293b40578ccornelius */ 36359d709d503bab6e2b61931737e662dd293b40578ccornelius 36459d709d503bab6e2b61931737e662dd293b40578ccornelius//indicates language/script that the CjkBreakEngine will handle 36559d709d503bab6e2b61931737e662dd293b40578ccorneliusenum LanguageType { 36659d709d503bab6e2b61931737e662dd293b40578ccornelius kKorean, 36759d709d503bab6e2b61931737e662dd293b40578ccornelius kChineseJapanese 36859d709d503bab6e2b61931737e662dd293b40578ccornelius}; 36959d709d503bab6e2b61931737e662dd293b40578ccornelius 37059d709d503bab6e2b61931737e662dd293b40578ccornelius/** 37159d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 37259d709d503bab6e2b61931737e662dd293b40578ccornelius * dictionary with costs associated with each word and 37359d709d503bab6e2b61931737e662dd293b40578ccornelius * Viterbi decoding to determine CJK-specific breaks.</p> 37459d709d503bab6e2b61931737e662dd293b40578ccornelius */ 37559d709d503bab6e2b61931737e662dd293b40578ccorneliusclass CjkBreakEngine : public DictionaryBreakEngine { 37659d709d503bab6e2b61931737e662dd293b40578ccornelius protected: 37759d709d503bab6e2b61931737e662dd293b40578ccornelius /** 37859d709d503bab6e2b61931737e662dd293b40578ccornelius * The set of characters handled by this engine 37959d709d503bab6e2b61931737e662dd293b40578ccornelius * @internal 38059d709d503bab6e2b61931737e662dd293b40578ccornelius */ 38159d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fHangulWordSet; 38259d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fHanWordSet; 38359d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fKatakanaWordSet; 38459d709d503bab6e2b61931737e662dd293b40578ccornelius UnicodeSet fHiraganaWordSet; 38559d709d503bab6e2b61931737e662dd293b40578ccornelius 386f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius DictionaryMatcher *fDictionary; 387f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const Normalizer2 *nfkcNorm2; 38859d709d503bab6e2b61931737e662dd293b40578ccornelius 38959d709d503bab6e2b61931737e662dd293b40578ccornelius public: 39059d709d503bab6e2b61931737e662dd293b40578ccornelius 39159d709d503bab6e2b61931737e662dd293b40578ccornelius /** 39259d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>Default constructor.</p> 39359d709d503bab6e2b61931737e662dd293b40578ccornelius * 39459d709d503bab6e2b61931737e662dd293b40578ccornelius * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 39559d709d503bab6e2b61931737e662dd293b40578ccornelius * engine is deleted. The DictionaryMatcher must contain costs for each word 39659d709d503bab6e2b61931737e662dd293b40578ccornelius * in order for the dictionary to work properly. 39759d709d503bab6e2b61931737e662dd293b40578ccornelius */ 39859d709d503bab6e2b61931737e662dd293b40578ccornelius CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 39959d709d503bab6e2b61931737e662dd293b40578ccornelius 40059d709d503bab6e2b61931737e662dd293b40578ccornelius /** 40159d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>Virtual destructor.</p> 40259d709d503bab6e2b61931737e662dd293b40578ccornelius */ 40359d709d503bab6e2b61931737e662dd293b40578ccornelius virtual ~CjkBreakEngine(); 40459d709d503bab6e2b61931737e662dd293b40578ccornelius 40559d709d503bab6e2b61931737e662dd293b40578ccornelius protected: 40659d709d503bab6e2b61931737e662dd293b40578ccornelius /** 40759d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 40859d709d503bab6e2b61931737e662dd293b40578ccornelius * 40959d709d503bab6e2b61931737e662dd293b40578ccornelius * @param text A UText representing the text 41059d709d503bab6e2b61931737e662dd293b40578ccornelius * @param rangeStart The start of the range of dictionary characters 41159d709d503bab6e2b61931737e662dd293b40578ccornelius * @param rangeEnd The end of the range of dictionary characters 41259d709d503bab6e2b61931737e662dd293b40578ccornelius * @param foundBreaks Output of C array of int32_t break positions, or 0 41359d709d503bab6e2b61931737e662dd293b40578ccornelius * @return The number of breaks found 41459d709d503bab6e2b61931737e662dd293b40578ccornelius */ 41559d709d503bab6e2b61931737e662dd293b40578ccornelius virtual int32_t divideUpDictionaryRange( UText *text, 41659d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t rangeStart, 41759d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t rangeEnd, 41859d709d503bab6e2b61931737e662dd293b40578ccornelius UStack &foundBreaks ) const; 41959d709d503bab6e2b61931737e662dd293b40578ccornelius 42059d709d503bab6e2b61931737e662dd293b40578ccornelius}; 42159d709d503bab6e2b61931737e662dd293b40578ccornelius 42259d709d503bab6e2b61931737e662dd293b40578ccornelius#endif 42359d709d503bab6e2b61931737e662dd293b40578ccornelius 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* DICTBE_H */ 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 428