1/*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007 Apple Inc. All rights reserved.
4 * Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
5 * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
6 * Copyright (C) 2010 Igalia S.L.
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB.  If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25#include "config.h"
26
27#include "TextBreakIterator.h"
28
29#include "GOwnPtr.h"
30#include <pango/pango.h>
31using namespace std;
32
// True when the code point lies in the supplementary planes (U+10000..U+10FFFF),
// i.e. when it occupies two UTF-16 code units (a surrogate pair) rather than one.
// Note that, despite the name, the argument is a decoded code point, not a UTF-8 byte
// or a UTF-16 surrogate. The argument is parenthesized so that expression arguments
// (e.g. "c & mask") are evaluated as a unit; it is evaluated twice, so it must not
// have side effects.
#define UTF8_IS_SURROGATE(character) ((character) >= 0x10000 && (character) <= 0x10FFFF)
34
35namespace WebCore {
36
37class CharacterIterator {
38public:
39    bool setText(const UChar* string, int length);
40    const gchar* getText() { return m_utf8.get(); }
41    int getLength() { return m_length; }
42    glong getSize() { return m_size; }
43    void setIndex(int index);
44    int getIndex() { return m_index; }
45    void setUTF16Index(int index);
46    int getUTF16Index() { return m_utf16Index; }
47    int getUTF16Length() { return m_utf16Length; }
48    int first();
49    int last();
50    int next();
51    int previous();
52private:
53    int characterSize(int index);
54
55    GOwnPtr<char> m_utf8;
56    int m_length;
57    long m_size;
58    int m_index;
59    int m_utf16Index;
60    int m_utf16Length;
61};
62
63int CharacterIterator::characterSize(int index)
64{
65    if (index == m_length || index < 0)
66        return 0;
67    if (m_length == m_utf16Length)
68        return 1;
69
70    gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index);
71    gunichar character = g_utf8_get_char(indexPtr);
72    return UTF8_IS_SURROGATE(character) ? 2 : 1;
73}
74
75bool CharacterIterator::setText(const UChar* string, int length)
76{
77    long utf8Size = 0;
78    m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0));
79    if (!utf8Size)
80        return false;
81
82    m_utf16Length = length;
83    m_length = g_utf8_strlen(m_utf8.get(), utf8Size);
84    m_size = utf8Size;
85    m_index = 0;
86    m_utf16Index = 0;
87
88    return true;
89}
90
91void CharacterIterator::setIndex(int index)
92{
93    if (index == m_index)
94        return;
95    if (index <= 0)
96        m_index = m_utf16Index = 0;
97    else if (index >= m_length) {
98        m_index = m_length;
99        m_utf16Index = m_utf16Length;
100    } else if (m_length == m_utf16Length)
101        m_index = m_utf16Index = index;
102    else {
103        m_index = index;
104        int utf16Index = 0;
105        int utf8Index = 0;
106        while (utf8Index < index) {
107            utf16Index += characterSize(utf8Index);
108            utf8Index++;
109        }
110        m_utf16Index = utf16Index;
111    }
112}
113
114void CharacterIterator::setUTF16Index(int index)
115{
116    if (index == m_utf16Index)
117        return;
118    if (index <= 0)
119        m_utf16Index = m_index = 0;
120    else if (index >= m_utf16Length) {
121        m_utf16Index = m_utf16Length;
122        m_index = m_length;
123    } else if (m_length == m_utf16Length)
124        m_utf16Index = m_index = index;
125    else {
126        m_utf16Index = index;
127        int utf16Index = 0;
128        int utf8Index = 0;
129        while (utf16Index < index) {
130            utf16Index += characterSize(utf8Index);
131            utf8Index++;
132        }
133        m_index = utf8Index;
134    }
135}
136
137int CharacterIterator::first()
138{
139    m_index = m_utf16Index = 0;
140    return m_index;
141}
142
143int CharacterIterator::last()
144{
145    m_index = m_length;
146    m_utf16Index = m_utf16Length;
147    return m_index;
148}
149
150int CharacterIterator::next()
151{
152    int next = m_index + 1;
153
154    if (next <= m_length) {
155        m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length);
156        m_index = next;
157    } else {
158        m_index = TextBreakDone;
159        m_utf16Index = TextBreakDone;
160    }
161
162    return m_index;
163}
164
165int CharacterIterator::previous()
166{
167    int previous = m_index - 1;
168
169    if (previous >= 0) {
170        m_utf16Index = max(m_utf16Index - characterSize(previous), 0);
171        m_index = previous;
172    } else {
173        m_index = TextBreakDone;
174        m_utf16Index = TextBreakDone;
175    }
176
177    return m_index;
178}
179
// Kind of boundary a TextBreakIterator stops at. Each kind is matched against
// one or more PangoLogAttr flags when stepping through the text.
enum UBreakIteratorType {
    UBRK_CHARACTER = 0, // stops where is_cursor_position is set
    UBRK_WORD = 1,      // stops where is_word_start or is_word_end is set
    UBRK_LINE = 2,      // stops where is_line_break is set
    UBRK_SENTENCE = 3   // stops where is_sentence_boundary is set
};
186
187class TextBreakIterator {
188public:
189    UBreakIteratorType m_type;
190    PangoLogAttr* m_logAttrs;
191    CharacterIterator m_charIterator;
192};
193
194static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
195    UBreakIteratorType type, const UChar* string, int length)
196{
197    if (!string)
198        return 0;
199
200    if (!createdIterator) {
201        iterator = new TextBreakIterator();
202        createdIterator = true;
203    }
204    if (!iterator)
205        return 0;
206
207    if (!iterator->m_charIterator.setText(string, length))
208        return 0;
209
210    int charLength = iterator->m_charIterator.getLength();
211
212    iterator->m_type = type;
213    if (createdIterator)
214        g_free(iterator->m_logAttrs);
215    iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1);
216    pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(),
217                        -1, 0, iterator->m_logAttrs, charLength + 1);
218
219    return iterator;
220}
221
222TextBreakIterator* characterBreakIterator(const UChar* string, int length)
223{
224    static bool createdCharacterBreakIterator = false;
225    static TextBreakIterator* staticCharacterBreakIterator;
226    return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
227}
228
229TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
230{
231    // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version.
232    return characterBreakIterator(string, length);
233}
234
235TextBreakIterator* wordBreakIterator(const UChar* string, int length)
236{
237    static bool createdWordBreakIterator = false;
238    static TextBreakIterator* staticWordBreakIterator;
239    return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length);
240}
241
242static bool createdLineBreakIterator = false;
243static TextBreakIterator* staticLineBreakIterator;
244
245TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
246{
247    TextBreakIterator* lineBreakIterator = 0;
248    if (!createdLineBreakIterator || staticLineBreakIterator) {
249        setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
250        swap(staticLineBreakIterator, lineBreakIterator);
251    }
252
253    if (!lineBreakIterator) {
254        bool createdNewLineBreakIterator = false;
255        setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
256    }
257
258    return lineBreakIterator;
259}
260
261void releaseLineBreakIterator(TextBreakIterator* iterator)
262{
263    ASSERT(createdLineBreakIterator);
264    ASSERT(iterator);
265
266    if (!staticLineBreakIterator)
267        staticLineBreakIterator = iterator;
268    else
269        delete iterator;
270}
271
272TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
273{
274    static bool createdSentenceBreakIterator = false;
275    static TextBreakIterator* staticSentenceBreakIterator;
276    return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
277}
278
279int textBreakFirst(TextBreakIterator* iterator)
280{
281    iterator->m_charIterator.first();
282    return iterator->m_charIterator.getUTF16Index();
283}
284
285int textBreakLast(TextBreakIterator* iterator)
286{
287    // TextBreakLast is not meant to find just any break according to bi->m_type
288    // but really the one near the last character.
289    // (cmp ICU documentation for ubrk_first and ubrk_last)
290    // From ICU docs for ubrk_last:
291    // "Determine the index immediately beyond the last character in the text being scanned."
292
293    // So we should advance or traverse back based on bi->m_logAttrs cursor positions.
294    // If last character position in the original string is a whitespace,
295    // traverse to the left until the first non-white character position is found
296    // and return the position of the first white-space char after this one.
297    // Otherwise return m_length, as "the first character beyond the last" is outside our string.
298
299    bool whiteSpaceAtTheEnd = true;
300    int nextWhiteSpacePos = iterator->m_charIterator.getLength();
301
302    int pos = iterator->m_charIterator.last();
303    while (pos >= 0 && whiteSpaceAtTheEnd) {
304        if (iterator->m_logAttrs[pos].is_cursor_position) {
305            if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white)
306                nextWhiteSpacePos = pos;
307        }
308        pos = iterator->m_charIterator.previous();
309    }
310    iterator->m_charIterator.setIndex(nextWhiteSpacePos);
311    return iterator->m_charIterator.getUTF16Index();
312}
313
314int textBreakNext(TextBreakIterator* iterator)
315{
316    while (iterator->m_charIterator.next() != TextBreakDone) {
317        int index = iterator->m_charIterator.getIndex();
318
319        // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol â¬,
320        // are not marked as word_start & word_end as opposed to the way ICU does it.
321        // This leads to - for example - different word selection behaviour when right clicking.
322
323        if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
324            || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
325            || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
326            || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
327            break;
328        }
329    }
330    return iterator->m_charIterator.getUTF16Index();
331}
332
333int textBreakPrevious(TextBreakIterator* iterator)
334{
335    while (iterator->m_charIterator.previous() != TextBreakDone) {
336        int index = iterator->m_charIterator.getIndex();
337
338        if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
339            || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
340            || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
341            || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
342            break;
343        }
344    }
345    return iterator->m_charIterator.getUTF16Index();
346}
347
348int textBreakPreceding(TextBreakIterator* iterator, int offset)
349{
350    if (offset > iterator->m_charIterator.getUTF16Length())
351        return TextBreakDone;
352    if (offset < 0)
353        return 0;
354    iterator->m_charIterator.setUTF16Index(offset);
355    return textBreakPrevious(iterator);
356}
357
358int textBreakFollowing(TextBreakIterator* iterator, int offset)
359{
360    if (offset > iterator->m_charIterator.getUTF16Length())
361        return TextBreakDone;
362    if (offset < 0)
363        return 0;
364    iterator->m_charIterator.setUTF16Index(offset);
365    return textBreakNext(iterator);
366}
367
368int textBreakCurrent(TextBreakIterator* iterator)
369{
370    return iterator->m_charIterator.getUTF16Index();
371}
372
373bool isTextBreak(TextBreakIterator* iterator, int offset)
374{
375    if (!offset)
376        return true;
377    if (offset > iterator->m_charIterator.getUTF16Length())
378        return false;
379
380    iterator->m_charIterator.setUTF16Index(offset);
381
382    int index = iterator->m_charIterator.getIndex();
383    iterator->m_charIterator.previous();
384    textBreakNext(iterator);
385    return iterator->m_charIterator.getIndex() == index;
386}
387
388}
389