1/*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB.  If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22#include "config.h"
23#include "TextBreakIterator.h"
24
25#include "PlatformString.h"
26#include "TextBreakIteratorInternalICU.h"
27#include <unicode/ubrk.h>
28#include <wtf/Assertions.h>
29
30using namespace std;
31
32namespace WebCore {
33
34static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
35    UBreakIteratorType type, const UChar* string, int length)
36{
37    if (!string)
38        return 0;
39
40    if (!createdIterator) {
41        UErrorCode openStatus = U_ZERO_ERROR;
42        iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus));
43        createdIterator = true;
44        ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
45    }
46    if (!iterator)
47        return 0;
48
49    UErrorCode setTextStatus = U_ZERO_ERROR;
50    ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
51    if (U_FAILURE(setTextStatus))
52        return 0;
53
54    return iterator;
55}
56
57TextBreakIterator* characterBreakIterator(const UChar* string, int length)
58{
59    static bool createdCharacterBreakIterator = false;
60    static TextBreakIterator* staticCharacterBreakIterator;
61    return setUpIterator(createdCharacterBreakIterator,
62        staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
63}
64
65TextBreakIterator* wordBreakIterator(const UChar* string, int length)
66{
67    static bool createdWordBreakIterator = false;
68    static TextBreakIterator* staticWordBreakIterator;
69    return setUpIterator(createdWordBreakIterator,
70        staticWordBreakIterator, UBRK_WORD, string, length);
71}
72
73static bool createdLineBreakIterator = false;
74static TextBreakIterator* staticLineBreakIterator;
75
76TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
77{
78    TextBreakIterator* lineBreakIterator = 0;
79    if (!createdLineBreakIterator || staticLineBreakIterator) {
80        setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
81        swap(staticLineBreakIterator, lineBreakIterator);
82    }
83
84    if (!lineBreakIterator) {
85        bool createdNewLineBreakIterator = false;
86        setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
87    }
88
89    return lineBreakIterator;
90}
91
92void releaseLineBreakIterator(TextBreakIterator* iterator)
93{
94    ASSERT(createdLineBreakIterator);
95    ASSERT(iterator);
96
97    if (!staticLineBreakIterator)
98        staticLineBreakIterator = iterator;
99    else
100        ubrk_close(reinterpret_cast<UBreakIterator*>(iterator));
101}
102
103TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
104{
105    static bool createdSentenceBreakIterator = false;
106    static TextBreakIterator* staticSentenceBreakIterator;
107    return setUpIterator(createdSentenceBreakIterator,
108        staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
109}
110
111int textBreakFirst(TextBreakIterator* iterator)
112{
113    return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
114}
115
116int textBreakLast(TextBreakIterator* iterator)
117{
118    return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
119}
120
121int textBreakNext(TextBreakIterator* iterator)
122{
123    return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
124}
125
126int textBreakPrevious(TextBreakIterator* iterator)
127{
128    return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
129}
130
131int textBreakPreceding(TextBreakIterator* iterator, int pos)
132{
133    return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
134}
135
136int textBreakFollowing(TextBreakIterator* iterator, int pos)
137{
138    return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
139}
140
141int textBreakCurrent(TextBreakIterator* iterator)
142{
143    return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
144}
145
146bool isTextBreak(TextBreakIterator* iterator, int position)
147{
148    return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
149}
150
151#ifndef BUILDING_ON_TIGER
152static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
153    const char* breakRules, const UChar* string, int length)
154{
155    if (!string)
156        return 0;
157
158    if (!createdIterator) {
159        UParseError parseStatus;
160        UErrorCode openStatus = U_ZERO_ERROR;
161        String rules(breakRules);
162        iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
163        createdIterator = true;
164        ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
165    }
166    if (!iterator)
167        return 0;
168
169    UErrorCode setTextStatus = U_ZERO_ERROR;
170    ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
171    if (U_FAILURE(setTextStatus))
172        return 0;
173
174    return iterator;
175}
176#endif // BUILDING_ON_TIGER
177
178TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
179{
180#ifdef BUILDING_ON_TIGER
181    // ICU 3.2 cannot compile the below rules.
182    return characterBreakIterator(string, length);
183#else
184    // This rule set is based on character-break iterator rules of ICU 4.0
185    // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
186    // The major differences from the original ones are listed below:
187    // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
188    // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
189    // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
190    // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
191    static const char* kRules =
192        "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
193        "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
194        "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
195        "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
196        "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
197        "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
198        "$L       = [\\p{Grapheme_Cluster_Break = L}];"
199        "$V       = [\\p{Grapheme_Cluster_Break = V}];"
200        "$T       = [\\p{Grapheme_Cluster_Break = T}];"
201        "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
202        "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
203        "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
204        "$HinV    = \\u094D;"              // Devanagari Sign Virama
205        "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
206        "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
207        "$BenV    = \\u09CD;"              // Bengali Sign Virama
208        "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
209        "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
210        "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
211        "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
212        "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
213        "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
214        "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
215        "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
216        "$OriV    = \\u0B4D;"              // Oriya Sign Virama
217        "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
218        "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
219        "$TelV    = \\u0C4D;"              // Telugu Sign Virama
220        "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
221        "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
222        "$KanV    = \\u0CCD;"              // Kannada Sign Virama
223        "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
224        "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
225        "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
226        "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
227        "!!chain;"
228        "!!forward;"
229        "$CR $LF;"
230        "$L ($L | $V | $LV | $LVT);"
231        "($LV | $V) ($V | $T);"
232        "($LVT | $T) $T;"
233        "[^$Control $CR $LF] $Extend;"
234        "[^$Control $CR $LF] $SpacingMark;"
235        "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
236        "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
237        "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
238        "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
239        "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
240        "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
241        "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
242        "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
243        "!!reverse;"
244        "$LF $CR;"
245        "($L | $V | $LV | $LVT) $L;"
246        "($V | $T) ($LV | $V);"
247        "$T ($LVT | $T);"
248        "$Extend      [^$Control $CR $LF];"
249        "$SpacingMark [^$Control $CR $LF];"
250        "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
251        "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
252        "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
253        "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
254        "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
255        "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
256        "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
257        "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
258        "!!safe_reverse;"
259        "!!safe_forward;";
260    static bool createdCursorMovementIterator = false;
261    static TextBreakIterator* staticCursorMovementIterator;
262    return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
263#endif // BUILDING_ON_TIGER
264}
265
266}
267