1/* 2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> 3 * Copyright (C) 2007 Apple Inc. All rights reserved. 4 * Copyright (C) 2008 Jürg Billeter <j@bitron.ch> 5 * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com> 6 * Copyright (C) 2010 Igalia S.L. 7 * 8 * This library is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Library General Public 10 * License as published by the Free Software Foundation; either 11 * version 2 of the License, or (at your option) any later version. 12 * 13 * This library is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Library General Public License for more details. 17 * 18 * You should have received a copy of the GNU Library General Public License 19 * along with this library; see the file COPYING.LIB. If not, write to 20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 21 * Boston, MA 02110-1301, USA. 22 * 23 */ 24 25#include "config.h" 26 27#include "TextBreakIterator.h" 28 29#include "GOwnPtr.h" 30#include <pango/pango.h> 31using namespace std; 32 33#define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF) 34 35namespace WebCore { 36 37class CharacterIterator { 38public: 39 bool setText(const UChar* string, int length); 40 const gchar* getText() { return m_utf8.get(); } 41 int getLength() { return m_length; } 42 glong getSize() { return m_size; } 43 void setIndex(int index); 44 int getIndex() { return m_index; } 45 void setUTF16Index(int index); 46 int getUTF16Index() { return m_utf16Index; } 47 int getUTF16Length() { return m_utf16Length; } 48 int first(); 49 int last(); 50 int next(); 51 int previous(); 52private: 53 int characterSize(int index); 54 55 GOwnPtr<char> m_utf8; 56 int m_length; 57 long m_size; 58 int m_index; 59 int m_utf16Index; 60 int m_utf16Length; 61}; 62 63int CharacterIterator::characterSize(int index) 64{ 65 if (index == m_length || index < 0) 66 return 0; 67 if (m_length == m_utf16Length) 68 return 1; 69 70 gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index); 71 gunichar character = g_utf8_get_char(indexPtr); 72 return UTF8_IS_SURROGATE(character) ? 2 : 1; 73} 74 75bool CharacterIterator::setText(const UChar* string, int length) 76{ 77 long utf8Size = 0; 78 m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0)); 79 if (!utf8Size) 80 return false; 81 82 m_utf16Length = length; 83 m_length = g_utf8_strlen(m_utf8.get(), utf8Size); 84 m_size = utf8Size; 85 m_index = 0; 86 m_utf16Index = 0; 87 88 return true; 89} 90 91void CharacterIterator::setIndex(int index) 92{ 93 if (index == m_index) 94 return; 95 if (index <= 0) 96 m_index = m_utf16Index = 0; 97 else if (index >= m_length) { 98 m_index = m_length; 99 m_utf16Index = m_utf16Length; 100 } else if (m_length == m_utf16Length) 101 m_index = m_utf16Index = index; 102 else { 103 m_index = index; 104 int utf16Index = 0; 105 int utf8Index = 0; 106 while (utf8Index < index) { 107 utf16Index += characterSize(utf8Index); 108 utf8Index++; 109 } 110 m_utf16Index = utf16Index; 111 } 112} 113 114void CharacterIterator::setUTF16Index(int index) 115{ 116 if (index == m_utf16Index) 117 return; 118 if (index <= 0) 119 m_utf16Index = m_index = 0; 120 else if (index >= m_utf16Length) { 121 m_utf16Index = m_utf16Length; 122 m_index = m_length; 123 } else if (m_length == m_utf16Length) 124 m_utf16Index = m_index = index; 125 else { 126 m_utf16Index = index; 127 int utf16Index = 0; 128 int utf8Index = 0; 129 while (utf16Index < index) { 130 utf16Index += characterSize(utf8Index); 131 utf8Index++; 132 } 133 m_index = utf8Index; 134 } 135} 136 137int CharacterIterator::first() 138{ 139 m_index = m_utf16Index = 0; 140 return m_index; 141} 142 143int CharacterIterator::last() 144{ 145 m_index = m_length; 146 m_utf16Index = m_utf16Length; 147 return m_index; 148} 149 150int CharacterIterator::next() 151{ 152 int next = m_index + 1; 153 154 if (next <= m_length) { 155 m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length); 156 m_index = next; 157 } else { 158 m_index = TextBreakDone; 159 m_utf16Index = TextBreakDone; 160 } 161 162 return m_index; 163} 164 165int CharacterIterator::previous() 166{ 167 int previous = m_index - 1; 168 169 if (previous >= 0) { 170 m_utf16Index = max(m_utf16Index - characterSize(previous), 0); 171 m_index = previous; 172 } else { 173 m_index = TextBreakDone; 174 m_utf16Index = TextBreakDone; 175 } 176 177 return m_index; 178} 179 180enum UBreakIteratorType { 181 UBRK_CHARACTER, 182 UBRK_WORD, 183 UBRK_LINE, 184 UBRK_SENTENCE 185}; 186 187class TextBreakIterator { 188public: 189 UBreakIteratorType m_type; 190 PangoLogAttr* m_logAttrs; 191 CharacterIterator m_charIterator; 192}; 193 194static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, 195 UBreakIteratorType type, const UChar* string, int length) 196{ 197 if (!string) 198 return 0; 199 200 if (!createdIterator) { 201 iterator = new TextBreakIterator(); 202 createdIterator = true; 203 } 204 if (!iterator) 205 return 0; 206 207 if (!iterator->m_charIterator.setText(string, length)) 208 return 0; 209 210 int charLength = iterator->m_charIterator.getLength(); 211 212 iterator->m_type = type; 213 if (createdIterator) 214 g_free(iterator->m_logAttrs); 215 iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1); 216 pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(), 217 -1, 0, iterator->m_logAttrs, charLength + 1); 218 219 return iterator; 220} 221 222TextBreakIterator* characterBreakIterator(const UChar* string, int length) 223{ 224 static bool createdCharacterBreakIterator = false; 225 static TextBreakIterator* staticCharacterBreakIterator; 226 return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length); 227} 228 229TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 230{ 231 // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version. 232 return characterBreakIterator(string, length); 233} 234 235TextBreakIterator* wordBreakIterator(const UChar* string, int length) 236{ 237 static bool createdWordBreakIterator = false; 238 static TextBreakIterator* staticWordBreakIterator; 239 return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length); 240} 241 242static bool createdLineBreakIterator = false; 243static TextBreakIterator* staticLineBreakIterator; 244 245TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length) 246{ 247 TextBreakIterator* lineBreakIterator = 0; 248 if (!createdLineBreakIterator || staticLineBreakIterator) { 249 setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length); 250 swap(staticLineBreakIterator, lineBreakIterator); 251 } 252 253 if (!lineBreakIterator) { 254 bool createdNewLineBreakIterator = false; 255 setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length); 256 } 257 258 return lineBreakIterator; 259} 260 261void releaseLineBreakIterator(TextBreakIterator* iterator) 262{ 263 ASSERT(createdLineBreakIterator); 264 ASSERT(iterator); 265 266 if (!staticLineBreakIterator) 267 staticLineBreakIterator = iterator; 268 else 269 delete iterator; 270} 271 272TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 273{ 274 static bool createdSentenceBreakIterator = false; 275 static TextBreakIterator* staticSentenceBreakIterator; 276 return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length); 277} 278 279int textBreakFirst(TextBreakIterator* iterator) 280{ 281 iterator->m_charIterator.first(); 282 return iterator->m_charIterator.getUTF16Index(); 283} 284 285int textBreakLast(TextBreakIterator* iterator) 286{ 287 // TextBreakLast is not meant to find just any break according to bi->m_type 288 // but really the one near the last character. 289 // (cmp ICU documentation for ubrk_first and ubrk_last) 290 // From ICU docs for ubrk_last: 291 // "Determine the index immediately beyond the last character in the text being scanned." 292 293 // So we should advance or traverse back based on bi->m_logAttrs cursor positions. 294 // If last character position in the original string is a whitespace, 295 // traverse to the left until the first non-white character position is found 296 // and return the position of the first white-space char after this one. 297 // Otherwise return m_length, as "the first character beyond the last" is outside our string. 298 299 bool whiteSpaceAtTheEnd = true; 300 int nextWhiteSpacePos = iterator->m_charIterator.getLength(); 301 302 int pos = iterator->m_charIterator.last(); 303 while (pos >= 0 && whiteSpaceAtTheEnd) { 304 if (iterator->m_logAttrs[pos].is_cursor_position) { 305 if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white) 306 nextWhiteSpacePos = pos; 307 } 308 pos = iterator->m_charIterator.previous(); 309 } 310 iterator->m_charIterator.setIndex(nextWhiteSpacePos); 311 return iterator->m_charIterator.getUTF16Index(); 312} 313 314int textBreakNext(TextBreakIterator* iterator) 315{ 316 while (iterator->m_charIterator.next() != TextBreakDone) { 317 int index = iterator->m_charIterator.getIndex(); 318 319 // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol â¬, 320 // are not marked as word_start & word_end as opposed to the way ICU does it. 321 // This leads to - for example - different word selection behaviour when right clicking. 322 323 if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) 324 || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) 325 || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) 326 || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { 327 break; 328 } 329 } 330 return iterator->m_charIterator.getUTF16Index(); 331} 332 333int textBreakPrevious(TextBreakIterator* iterator) 334{ 335 while (iterator->m_charIterator.previous() != TextBreakDone) { 336 int index = iterator->m_charIterator.getIndex(); 337 338 if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) 339 || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) 340 || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) 341 || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { 342 break; 343 } 344 } 345 return iterator->m_charIterator.getUTF16Index(); 346} 347 348int textBreakPreceding(TextBreakIterator* iterator, int offset) 349{ 350 if (offset > iterator->m_charIterator.getUTF16Length()) 351 return TextBreakDone; 352 if (offset < 0) 353 return 0; 354 iterator->m_charIterator.setUTF16Index(offset); 355 return textBreakPrevious(iterator); 356} 357 358int textBreakFollowing(TextBreakIterator* iterator, int offset) 359{ 360 if (offset > iterator->m_charIterator.getUTF16Length()) 361 return TextBreakDone; 362 if (offset < 0) 363 return 0; 364 iterator->m_charIterator.setUTF16Index(offset); 365 return textBreakNext(iterator); 366} 367 368int textBreakCurrent(TextBreakIterator* iterator) 369{ 370 return iterator->m_charIterator.getUTF16Index(); 371} 372 373bool isTextBreak(TextBreakIterator* iterator, int offset) 374{ 375 if (!offset) 376 return true; 377 if (offset > iterator->m_charIterator.getUTF16Length()) 378 return false; 379 380 iterator->m_charIterator.setUTF16Index(offset); 381 382 int index = iterator->m_charIterator.getIndex(); 383 iterator->m_charIterator.previous(); 384 textBreakNext(iterator); 385 return iterator->m_charIterator.getIndex() == index; 386} 387 388} 389