1/* 2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> 3 * Copyright (C) 2007 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22#include "config.h" 23#include "TextBreakIterator.h" 24 25#include "PlatformString.h" 26#include "TextBreakIteratorInternalICU.h" 27#include <unicode/ubrk.h> 28#include <wtf/Assertions.h> 29 30using namespace std; 31 32namespace WebCore { 33 34static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, 35 UBreakIteratorType type, const UChar* string, int length) 36{ 37 if (!string) 38 return 0; 39 40 if (!createdIterator) { 41 UErrorCode openStatus = U_ZERO_ERROR; 42 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus)); 43 createdIterator = true; 44 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 45 } 46 if (!iterator) 47 return 0; 48 49 UErrorCode setTextStatus = U_ZERO_ERROR; 50 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); 51 if (U_FAILURE(setTextStatus)) 52 return 0; 53 54 return iterator; 55} 56 57TextBreakIterator* characterBreakIterator(const UChar* string, int length) 58{ 59 static bool createdCharacterBreakIterator = false; 60 static TextBreakIterator* staticCharacterBreakIterator; 61 return setUpIterator(createdCharacterBreakIterator, 62 staticCharacterBreakIterator, UBRK_CHARACTER, string, length); 63} 64 65TextBreakIterator* wordBreakIterator(const UChar* string, int length) 66{ 67 static bool createdWordBreakIterator = false; 68 static TextBreakIterator* staticWordBreakIterator; 69 return setUpIterator(createdWordBreakIterator, 70 staticWordBreakIterator, UBRK_WORD, string, length); 71} 72 73static bool createdLineBreakIterator = false; 74static TextBreakIterator* staticLineBreakIterator; 75 76TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length) 77{ 78 TextBreakIterator* lineBreakIterator = 0; 79 if (!createdLineBreakIterator || staticLineBreakIterator) { 80 setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length); 81 swap(staticLineBreakIterator, lineBreakIterator); 82 } 83 84 if (!lineBreakIterator) { 85 bool createdNewLineBreakIterator = false; 86 setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length); 87 } 88 89 return lineBreakIterator; 90} 91 92void releaseLineBreakIterator(TextBreakIterator* iterator) 93{ 94 ASSERT(createdLineBreakIterator); 95 ASSERT(iterator); 96 97 if (!staticLineBreakIterator) 98 staticLineBreakIterator = iterator; 99 else 100 ubrk_close(reinterpret_cast<UBreakIterator*>(iterator)); 101} 102 103TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 104{ 105 static bool createdSentenceBreakIterator = false; 106 static TextBreakIterator* staticSentenceBreakIterator; 107 return setUpIterator(createdSentenceBreakIterator, 108 staticSentenceBreakIterator, UBRK_SENTENCE, string, length); 109} 110 111int textBreakFirst(TextBreakIterator* iterator) 112{ 113 return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator)); 114} 115 116int textBreakLast(TextBreakIterator* iterator) 117{ 118 return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator)); 119} 120 121int textBreakNext(TextBreakIterator* iterator) 122{ 123 return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator)); 124} 125 126int textBreakPrevious(TextBreakIterator* iterator) 127{ 128 return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator)); 129} 130 131int textBreakPreceding(TextBreakIterator* iterator, int pos) 132{ 133 return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos); 134} 135 136int textBreakFollowing(TextBreakIterator* iterator, int pos) 137{ 138 return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos); 139} 140 141int textBreakCurrent(TextBreakIterator* iterator) 142{ 143 return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator)); 144} 145 146bool isTextBreak(TextBreakIterator* iterator, int position) 147{ 148 return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position); 149} 150 151#ifndef BUILDING_ON_TIGER 152static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator, 153 const char* breakRules, const UChar* string, int length) 154{ 155 if (!string) 156 return 0; 157 158 if (!createdIterator) { 159 UParseError parseStatus; 160 UErrorCode openStatus = U_ZERO_ERROR; 161 String rules(breakRules); 162 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus)); 163 createdIterator = true; 164 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 165 } 166 if (!iterator) 167 return 0; 168 169 UErrorCode setTextStatus = U_ZERO_ERROR; 170 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); 171 if (U_FAILURE(setTextStatus)) 172 return 0; 173 174 return iterator; 175} 176#endif // BUILDING_ON_TIGER 177 178TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 179{ 180#ifdef BUILDING_ON_TIGER 181 // ICU 3.2 cannot compile the below rules. 182 return characterBreakIterator(string, length); 183#else 184 // This rule set is based on character-break iterator rules of ICU 4.0 185 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. 186 // The major differences from the original ones are listed below: 187 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; 188 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); 189 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; 190 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks. 191 static const char* kRules = 192 "$CR = [\\p{Grapheme_Cluster_Break = CR}];" 193 "$LF = [\\p{Grapheme_Cluster_Break = LF}];" 194 "$Control = [\\p{Grapheme_Cluster_Break = Control}];" 195 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks 196 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" 197 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" 198 "$L = [\\p{Grapheme_Cluster_Break = L}];" 199 "$V = [\\p{Grapheme_Cluster_Break = V}];" 200 "$T = [\\p{Grapheme_Cluster_Break = T}];" 201 "$LV = [\\p{Grapheme_Cluster_Break = LV}];" 202 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" 203 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha 204 "$HinV = \\u094D;" // Devanagari Sign Virama 205 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha 206 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha 207 "$BenV = \\u09CD;" // Bengali Sign Virama 208 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha 209 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha 210 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama 211 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha 212 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha 213 "$GujV = \\u0ACD;" // Gujarati Sign Virama 214 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha 215 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha 216 "$OriV = \\u0B4D;" // Oriya Sign Virama 217 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha 218 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha 219 "$TelV = \\u0C4D;" // Telugu Sign Virama 220 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha 221 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha 222 "$KanV = \\u0CCD;" // Kannada Sign Virama 223 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha 224 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha 225 "$MalV = \\u0D4D;" // Malayalam Sign Virama 226 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha 227 "!!chain;" 228 "!!forward;" 229 "$CR $LF;" 230 "$L ($L | $V | $LV | $LVT);" 231 "($LV | $V) ($V | $T);" 232 "($LVT | $T) $T;" 233 "[^$Control $CR $LF] $Extend;" 234 "[^$Control $CR $LF] $SpacingMark;" 235 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) 236 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) 237 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) 238 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) 239 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) 240 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) 241 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) 242 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) 243 "!!reverse;" 244 "$LF $CR;" 245 "($L | $V | $LV | $LVT) $L;" 246 "($V | $T) ($LV | $V);" 247 "$T ($LVT | $T);" 248 "$Extend [^$Control $CR $LF];" 249 "$SpacingMark [^$Control $CR $LF];" 250 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) 251 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) 252 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) 253 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) 254 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) 255 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) 256 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) 257 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) 258 "!!safe_reverse;" 259 "!!safe_forward;"; 260 static bool createdCursorMovementIterator = false; 261 static TextBreakIterator* staticCursorMovementIterator; 262 return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length); 263#endif // BUILDING_ON_TIGER 264} 265 266} 267