/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation   *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */

8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef DICTBE_H
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define DICTBE_H
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utext.h"
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "brkeng.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1954dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass DictionaryMatcher;
20f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusclass Normalizer2;
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*******************************************************************
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * DictionaryBreakEngine
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * dictionary to determine language-specific breaks.</p>
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>After it is constructed a DictionaryBreakEngine may be shared between
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * threads without synchronization.</p>
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass DictionaryBreakEngine : public LanguageBreakEngine {
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru private:
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * The set of characters handled by this engine
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @internal
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UnicodeSet    fSet;
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * The set of break types handled by this engine
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @internal
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  uint32_t      fTypes;
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  /**
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   * <p>Default constructor.</p>
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   *
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   */
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  DictionaryBreakEngine();
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public:
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  /**
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   * <p>Constructor setting the break types handled.</p>
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   *
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   * @param breakTypes A bitmap of types handled by the engine.
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   */
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  DictionaryBreakEngine( uint32_t breakTypes );
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  /**
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   * <p>Virtual destructor.</p>
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   */
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  virtual ~DictionaryBreakEngine();
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
6954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  /**
7054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * <p>Indicate whether this engine handles a particular character for
7154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * a particular kind of break.</p>
7254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   *
7354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param c A character which begins a run that the engine might handle
7454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param breakType The type of text break which the caller wants to determine
7554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @return TRUE if this engine handles the particular character and break
7654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * type.
7754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   */
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  virtual UBool handles( UChar32 c, int32_t breakType ) const;
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  /**
8154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * <p>Find any breaks within a run in the supplied text.</p>
8254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   *
8354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param text A UText representing the text. The iterator is left at
8454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * the end of the run of characters which the engine is capable of handling
8554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * that starts from the first (or last) character in the range.
8654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param startPos The start of the run within the supplied text.
8754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param endPos The end of the run within the supplied text.
8854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param reverse Whether the caller is looking for breaks in a reverse
8954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * direction.
9054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param breakType The type of break desired, or -1.
9154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param foundBreaks An allocated C array of the breaks found, if any
9254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @return The number of breaks found.
9354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   */
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  virtual int32_t findBreaks( UText *text,
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                              int32_t startPos,
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                              int32_t endPos,
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                              UBool reverse,
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                              int32_t breakType,
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                              UStack &foundBreaks ) const;
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru protected:
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /**
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * <p>Set the character set handled by this engine.</p>
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  *
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param set A UnicodeSet of the set of characters handled by the engine
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  */
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  virtual void setCharacters( const UnicodeSet &set );
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /**
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * <p>Set the break types handled by this engine.</p>
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  *
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param breakTypes A bitmap of types handled by the engine.
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  */
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  virtual void setBreakTypes( uint32_t breakTypes );
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /**
11854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  *
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param text A UText representing the text
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param rangeStart The start of the range of dictionary characters
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param rangeEnd The end of the range of dictionary characters
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param foundBreaks Output of C array of int32_t break positions, or 0
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @return The number of breaks found
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  */
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  virtual int32_t divideUpDictionaryRange( UText *text,
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           int32_t rangeStart,
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           int32_t rangeEnd,
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           UStack &foundBreaks ) const = 0;
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*******************************************************************
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ThaiBreakEngine
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
13954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * dictionary and heuristics to determine Thai-specific breaks.</p>
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * <p>After it is constructed a ThaiBreakEngine may be shared between
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * threads without synchronization.</p>
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass ThaiBreakEngine : public DictionaryBreakEngine {
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru private:
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * The set of characters handled by this engine
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @internal
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UnicodeSet                fThaiWordSet;
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UnicodeSet                fEndWordSet;
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UnicodeSet                fBeginWordSet;
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UnicodeSet                fSuffixSet;
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UnicodeSet                fMarkSet;
15654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  DictionaryMatcher  *fDictionary;
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public:
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  /**
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   * <p>Default constructor.</p>
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   *
16354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   * engine is deleted.
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   */
16654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  /**
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   * <p>Virtual destructor.</p>
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   */
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  virtual ~ThaiBreakEngine();
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru protected:
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /**
17554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  *
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param text A UText representing the text
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param rangeStart The start of the range of dictionary characters
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param rangeEnd The end of the range of dictionary characters
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @param foundBreaks Output of C array of int32_t break positions, or 0
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  * @return The number of breaks found
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  */
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  virtual int32_t divideUpDictionaryRange( UText *text,
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           int32_t rangeStart,
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           int32_t rangeEnd,
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           UStack &foundBreaks ) const;
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
19054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/*******************************************************************
19159d709d503bab6e2b61931737e662dd293b40578ccornelius * LaoBreakEngine
19254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */
19354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
19454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/**
19559d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
19659d709d503bab6e2b61931737e662dd293b40578ccornelius * dictionary and heuristics to determine Lao-specific breaks.</p>
19759d709d503bab6e2b61931737e662dd293b40578ccornelius *
19859d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>After it is constructed a LaoBreakEngine may be shared between
19959d709d503bab6e2b61931737e662dd293b40578ccornelius * threads without synchronization.</p>
20054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */
20159d709d503bab6e2b61931737e662dd293b40578ccorneliusclass LaoBreakEngine : public DictionaryBreakEngine {
20259d709d503bab6e2b61931737e662dd293b40578ccornelius private:
20354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    /**
20454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * The set of characters handled by this engine
20554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * @internal
20654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     */
20754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
20859d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fLaoWordSet;
20959d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fEndWordSet;
21059d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fBeginWordSet;
21159d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fMarkSet;
21254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  DictionaryMatcher  *fDictionary;
21354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
21454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius public:
21554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
21659d709d503bab6e2b61931737e662dd293b40578ccornelius  /**
21759d709d503bab6e2b61931737e662dd293b40578ccornelius   * <p>Default constructor.</p>
21859d709d503bab6e2b61931737e662dd293b40578ccornelius   *
21959d709d503bab6e2b61931737e662dd293b40578ccornelius   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
22059d709d503bab6e2b61931737e662dd293b40578ccornelius   * engine is deleted.
22159d709d503bab6e2b61931737e662dd293b40578ccornelius   */
22259d709d503bab6e2b61931737e662dd293b40578ccornelius  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
22354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
22459d709d503bab6e2b61931737e662dd293b40578ccornelius  /**
22559d709d503bab6e2b61931737e662dd293b40578ccornelius   * <p>Virtual destructor.</p>
22659d709d503bab6e2b61931737e662dd293b40578ccornelius   */
22759d709d503bab6e2b61931737e662dd293b40578ccornelius  virtual ~LaoBreakEngine();
22854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
22954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius protected:
23059d709d503bab6e2b61931737e662dd293b40578ccornelius /**
23159d709d503bab6e2b61931737e662dd293b40578ccornelius  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
23259d709d503bab6e2b61931737e662dd293b40578ccornelius  *
23359d709d503bab6e2b61931737e662dd293b40578ccornelius  * @param text A UText representing the text
23459d709d503bab6e2b61931737e662dd293b40578ccornelius  * @param rangeStart The start of the range of dictionary characters
23559d709d503bab6e2b61931737e662dd293b40578ccornelius  * @param rangeEnd The end of the range of dictionary characters
23659d709d503bab6e2b61931737e662dd293b40578ccornelius  * @param foundBreaks Output of C array of int32_t break positions, or 0
23759d709d503bab6e2b61931737e662dd293b40578ccornelius  * @return The number of breaks found
23859d709d503bab6e2b61931737e662dd293b40578ccornelius  */
23954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  virtual int32_t divideUpDictionaryRange( UText *text,
24059d709d503bab6e2b61931737e662dd293b40578ccornelius                                           int32_t rangeStart,
24159d709d503bab6e2b61931737e662dd293b40578ccornelius                                           int32_t rangeEnd,
24259d709d503bab6e2b61931737e662dd293b40578ccornelius                                           UStack &foundBreaks ) const;
24354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
24454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius};
24554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
246b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/*******************************************************************
247f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * BurmeseBreakEngine
248f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */
249f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
250f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/**
251f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
252f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
253f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius *
254f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * <p>After it is constructed a BurmeseBreakEngine may be shared between
255f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * threads without synchronization.</p>
256f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */
257f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusclass BurmeseBreakEngine : public DictionaryBreakEngine {
258f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius private:
259f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    /**
260f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius     * The set of characters handled by this engine
261f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius     * @internal
262f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius     */
263f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
264f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  UnicodeSet                fBurmeseWordSet;
265f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  UnicodeSet                fEndWordSet;
266f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  UnicodeSet                fBeginWordSet;
267f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  UnicodeSet                fMarkSet;
268f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  DictionaryMatcher  *fDictionary;
269f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
270f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius public:
271f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
272f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  /**
273f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius   * <p>Default constructor.</p>
274f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius   *
275f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
276f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius   * engine is deleted.
277f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius   */
278f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
279f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
280f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  /**
281f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius   * <p>Virtual destructor.</p>
282f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius   */
283f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  virtual ~BurmeseBreakEngine();
284f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
285f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius protected:
286f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /**
287f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  * <p>Divide up a range of known dictionary characters.</p>
288f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  *
289f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  * @param text A UText representing the text
290f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  * @param rangeStart The start of the range of dictionary characters
291f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  * @param rangeEnd The end of the range of dictionary characters
292f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  * @param foundBreaks Output of C array of int32_t break positions, or 0
293f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  * @return The number of breaks found
294f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  */
295f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  virtual int32_t divideUpDictionaryRange( UText *text,
296f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius                                           int32_t rangeStart,
297f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius                                           int32_t rangeEnd,
298f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius                                           UStack &foundBreaks ) const;
299f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
300f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius};
301f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
302f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/*******************************************************************
303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * KhmerBreakEngine
304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */
305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
306b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/**
307b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
30854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *
310b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * <p>After it is constructed a KhmerBreakEngine may be shared between
311b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * threads without synchronization.</p>
312b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */
313b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass KhmerBreakEngine : public DictionaryBreakEngine {
314b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho private:
315b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /**
316b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * The set of characters handled by this engine
317b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @internal
318b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     */
319b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
320b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  UnicodeSet                fKhmerWordSet;
321b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  UnicodeSet                fEndWordSet;
322b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  UnicodeSet                fBeginWordSet;
323b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  UnicodeSet                fMarkSet;
32454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  DictionaryMatcher  *fDictionary;
325b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
326b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho public:
327b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
328b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  /**
329b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho   * <p>Default constructor.</p>
330b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho   *
33154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
332b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho   * engine is deleted.
333b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho   */
33454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
335b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
336b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  /**
337b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho   * <p>Virtual destructor.</p>
338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho   */
339b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  virtual ~KhmerBreakEngine();
340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
341b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho protected:
342b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /**
343b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  * <p>Divide up a range of known dictionary characters.</p>
344b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  *
345b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  * @param text A UText representing the text
346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  * @param rangeStart The start of the range of dictionary characters
347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  * @param rangeEnd The end of the range of dictionary characters
348b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  * @param foundBreaks Output of C array of int32_t break positions, or 0
349b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  * @return The number of breaks found
350b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  */
351b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho  virtual int32_t divideUpDictionaryRange( UText *text,
352b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                           int32_t rangeStart,
353b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                           int32_t rangeEnd,
354b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                           UStack &foundBreaks ) const;
355b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
357b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
35859d709d503bab6e2b61931737e662dd293b40578ccornelius#if !UCONFIG_NO_NORMALIZATION
35959d709d503bab6e2b61931737e662dd293b40578ccornelius
36059d709d503bab6e2b61931737e662dd293b40578ccornelius/*******************************************************************
36159d709d503bab6e2b61931737e662dd293b40578ccornelius * CjkBreakEngine
36259d709d503bab6e2b61931737e662dd293b40578ccornelius */
36359d709d503bab6e2b61931737e662dd293b40578ccornelius
/**
 * Indicates the language/script that the CjkBreakEngine will handle.
 */
enum LanguageType {
    kKorean,
    kChineseJapanese
};
36959d709d503bab6e2b61931737e662dd293b40578ccornelius
37059d709d503bab6e2b61931737e662dd293b40578ccornelius/**
37159d709d503bab6e2b61931737e662dd293b40578ccornelius * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
37259d709d503bab6e2b61931737e662dd293b40578ccornelius * dictionary with costs associated with each word and
37359d709d503bab6e2b61931737e662dd293b40578ccornelius * Viterbi decoding to determine CJK-specific breaks.</p>
37459d709d503bab6e2b61931737e662dd293b40578ccornelius */
37559d709d503bab6e2b61931737e662dd293b40578ccorneliusclass CjkBreakEngine : public DictionaryBreakEngine {
37659d709d503bab6e2b61931737e662dd293b40578ccornelius protected:
37759d709d503bab6e2b61931737e662dd293b40578ccornelius    /**
37859d709d503bab6e2b61931737e662dd293b40578ccornelius     * The set of characters handled by this engine
37959d709d503bab6e2b61931737e662dd293b40578ccornelius     * @internal
38059d709d503bab6e2b61931737e662dd293b40578ccornelius     */
38159d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fHangulWordSet;
38259d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fHanWordSet;
38359d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fKatakanaWordSet;
38459d709d503bab6e2b61931737e662dd293b40578ccornelius  UnicodeSet                fHiraganaWordSet;
38559d709d503bab6e2b61931737e662dd293b40578ccornelius
386f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  DictionaryMatcher        *fDictionary;
387f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius  const Normalizer2        *nfkcNorm2;
38859d709d503bab6e2b61931737e662dd293b40578ccornelius
38959d709d503bab6e2b61931737e662dd293b40578ccornelius public:
39059d709d503bab6e2b61931737e662dd293b40578ccornelius
39159d709d503bab6e2b61931737e662dd293b40578ccornelius    /**
39259d709d503bab6e2b61931737e662dd293b40578ccornelius     * <p>Default constructor.</p>
39359d709d503bab6e2b61931737e662dd293b40578ccornelius     *
39459d709d503bab6e2b61931737e662dd293b40578ccornelius     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
39559d709d503bab6e2b61931737e662dd293b40578ccornelius     * engine is deleted. The DictionaryMatcher must contain costs for each word
39659d709d503bab6e2b61931737e662dd293b40578ccornelius     * in order for the dictionary to work properly.
39759d709d503bab6e2b61931737e662dd293b40578ccornelius     */
39859d709d503bab6e2b61931737e662dd293b40578ccornelius  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
39959d709d503bab6e2b61931737e662dd293b40578ccornelius
40059d709d503bab6e2b61931737e662dd293b40578ccornelius    /**
40159d709d503bab6e2b61931737e662dd293b40578ccornelius     * <p>Virtual destructor.</p>
40259d709d503bab6e2b61931737e662dd293b40578ccornelius     */
40359d709d503bab6e2b61931737e662dd293b40578ccornelius  virtual ~CjkBreakEngine();
40459d709d503bab6e2b61931737e662dd293b40578ccornelius
40559d709d503bab6e2b61931737e662dd293b40578ccornelius protected:
40659d709d503bab6e2b61931737e662dd293b40578ccornelius    /**
40759d709d503bab6e2b61931737e662dd293b40578ccornelius     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
40859d709d503bab6e2b61931737e662dd293b40578ccornelius     *
40959d709d503bab6e2b61931737e662dd293b40578ccornelius     * @param text A UText representing the text
41059d709d503bab6e2b61931737e662dd293b40578ccornelius     * @param rangeStart The start of the range of dictionary characters
41159d709d503bab6e2b61931737e662dd293b40578ccornelius     * @param rangeEnd The end of the range of dictionary characters
41259d709d503bab6e2b61931737e662dd293b40578ccornelius     * @param foundBreaks Output of C array of int32_t break positions, or 0
41359d709d503bab6e2b61931737e662dd293b40578ccornelius     * @return The number of breaks found
41459d709d503bab6e2b61931737e662dd293b40578ccornelius     */
41559d709d503bab6e2b61931737e662dd293b40578ccornelius  virtual int32_t divideUpDictionaryRange( UText *text,
41659d709d503bab6e2b61931737e662dd293b40578ccornelius          int32_t rangeStart,
41759d709d503bab6e2b61931737e662dd293b40578ccornelius          int32_t rangeEnd,
41859d709d503bab6e2b61931737e662dd293b40578ccornelius          UStack &foundBreaks ) const;
41959d709d503bab6e2b61931737e662dd293b40578ccornelius
42059d709d503bab6e2b61931737e662dd293b40578ccornelius};
42159d709d503bab6e2b61931737e662dd293b40578ccornelius
42259d709d503bab6e2b61931737e662dd293b40578ccornelius#endif
42359d709d503bab6e2b61931737e662dd293b40578ccornelius
424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /* DICTBE_H */
427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
428