1/*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB.  If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22#include "config.h"
23#include "platform/text/TextBreakIterator.h"
24
25#include "platform/text/TextBreakIteratorInternalICU.h"
26#include "wtf/Assertions.h"
27#include "wtf/HashMap.h"
28#include "wtf/PassOwnPtr.h"
29#include "wtf/ThreadSpecific.h"
30#include "wtf/ThreadingPrimitives.h"
31#include "wtf/text/AtomicString.h"
32#include "wtf/text/CString.h"
33#include "wtf/text/WTFString.h"
34#include <unicode/rbbi.h>
35#include <unicode/ubrk.h>
36
37using namespace WTF;
38
39namespace blink {
40
41class LineBreakIteratorPool {
42    WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
43public:
44    static LineBreakIteratorPool& sharedPool()
45    {
46        static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
47        return **pool;
48    }
49
50    static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
51
52    icu::BreakIterator* take(const AtomicString& locale)
53    {
54        icu::BreakIterator* iterator = 0;
55        for (size_t i = 0; i < m_pool.size(); ++i) {
56            if (m_pool[i].first == locale) {
57                iterator = m_pool[i].second;
58                m_pool.remove(i);
59                break;
60            }
61        }
62
63        if (!iterator) {
64            UErrorCode openStatus = U_ZERO_ERROR;
65            bool localeIsEmpty = locale.isEmpty();
66            iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus);
67            // locale comes from a web page and it can be invalid, leading ICU
68            // to fail, in which case we fall back to the default locale.
69            if (!localeIsEmpty && U_FAILURE(openStatus)) {
70                openStatus = U_ZERO_ERROR;
71                iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
72            }
73
74            if (U_FAILURE(openStatus)) {
75                WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
76                return 0;
77            }
78        }
79
80        ASSERT(!m_vendedIterators.contains(iterator));
81        m_vendedIterators.set(iterator, locale);
82        return iterator;
83    }
84
85    void put(icu::BreakIterator* iterator)
86    {
87        ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
88
89        if (m_pool.size() == capacity) {
90            delete(m_pool[0].second);
91            m_pool.remove(0);
92        }
93
94        m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
95    }
96
97private:
98    LineBreakIteratorPool() { }
99
100    static const size_t capacity = 4;
101
102    typedef pair<AtomicString, icu::BreakIterator*> Entry;
103    typedef Vector<Entry, capacity> Pool;
104    Pool m_pool;
105    HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
106
107    friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
108};
109
// Identifies which backing store a UText chunk currently refers to:
// NoContext when no chunk has been set up yet (chunkContents is null),
// PriorContext for the prior-context string (UText::q), and PrimaryContext
// for the primary string being broken.
enum TextContext { NoContext, PriorContext, PrimaryContext };

// Number of UChars in the inline buffer of UTextWithBuffer.
const int textBufferCapacity = 16;

// A UText bundled with a small UChar buffer. Callers in this file point
// UText::pExtra at |buffer| before opening a Latin-1 text.
typedef struct {
    UText text;
    UChar buffer[textBufferCapacity];
} UTextWithBuffer;
118
// Pins |index| in place and returns the pinned value: a negative index
// becomes 0, otherwise an index greater than |limit| becomes |limit|.
// The negative check wins; a value raised to 0 is not compared to |limit|.
static inline int64_t textPinIndex(int64_t& index, int64_t limit)
{
    index = index < 0 ? 0 : (index > limit ? limit : index);
    return index;
}
127
128static inline int64_t textNativeLength(UText* text)
129{
130    return text->a + text->b;
131}
132
133// Relocate pointer from source into destination as required.
134static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
135{
136    if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
137        // Pointer references source extra buffer.
138        pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
139    } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
140        // Pointer references source text structure, but not source extra buffer.
141        pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
142    }
143}
144
145static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
146{
147    ASSERT_UNUSED(deep, !deep);
148    if (U_FAILURE(*status))
149        return 0;
150    int32_t extraSize = source->extraSize;
151    destination = utext_setup(destination, extraSize, status);
152    if (U_FAILURE(*status))
153        return destination;
154    void* extraNew = destination->pExtra;
155    int32_t flags = destination->flags;
156    int sizeToCopy = std::min(source->sizeOfStruct, destination->sizeOfStruct);
157    memcpy(destination, source, sizeToCopy);
158    destination->pExtra = extraNew;
159    destination->flags = flags;
160    memcpy(destination->pExtra, source->pExtra, extraSize);
161    textFixPointer(source, destination, destination->context);
162    textFixPointer(source, destination, destination->p);
163    textFixPointer(source, destination, destination->q);
164    ASSERT(!destination->r);
165    const void * chunkContents = static_cast<const void*>(destination->chunkContents);
166    textFixPointer(source, destination, chunkContents);
167    destination->chunkContents = static_cast<const UChar*>(chunkContents);
168    return destination;
169}
170
// UTextFuncs extract callback. Extraction is not implemented by these
// providers: the call asserts in debug builds, reports U_UNSUPPORTED_ERROR
// through |errorCode| and returns 0.
static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
{
    // In the present context, this text provider is used only with ICU functions
    // that do not perform an extract operation.
    ASSERT_NOT_REACHED();
    *errorCode = U_UNSUPPORTED_ERROR;
    return 0;
}
179
// UTextFuncs close callback. Clears the context pointer; the access callbacks
// in this file return FALSE for a text whose context is null.
static void textClose(UText* text)
{
    text->context = 0;
}
184
185static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
186{
187    if (!text->b || nativeIndex > text->b)
188        return PrimaryContext;
189    if (nativeIndex == text->b)
190        return forward ? PrimaryContext : PriorContext;
191    return PriorContext;
192}
193
194static inline TextContext textLatin1GetCurrentContext(const UText* text)
195{
196    if (!text->chunkContents)
197        return NoContext;
198    return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
199}
200
201static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
202{
203    ASSERT(text->chunkContents == text->pExtra);
204    if (forward) {
205        ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
206        text->chunkNativeStart = nativeIndex;
207        text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
208        if (text->chunkNativeLimit > nativeLength)
209            text->chunkNativeLimit = nativeLength;
210    } else {
211        ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
212        text->chunkNativeLimit = nativeIndex;
213        text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
214        if (text->chunkNativeStart < text->b)
215            text->chunkNativeStart = text->b;
216    }
217    int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
218    // Ensure chunk length is well defined if computed length exceeds int32_t range.
219    ASSERT(length <= std::numeric_limits<int32_t>::max());
220    text->chunkLength = length <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
221    text->nativeIndexingLimit = text->chunkLength;
222    text->chunkOffset = forward ? 0 : text->chunkLength;
223    StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
224}
225
226static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
227{
228    ASSERT(!text->chunkContents || text->chunkContents == text->q);
229    text->chunkContents = static_cast<const UChar*>(text->pExtra);
230    textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
231}
232
233static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
234{
235    ASSERT(text->chunkContents == text->q);
236    ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
237    ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
238    ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
239    text->chunkNativeStart = 0;
240    text->chunkNativeLimit = text->b;
241    text->chunkLength = text->b;
242    text->nativeIndexingLimit = text->chunkLength;
243    int64_t offset = nativeIndex - text->chunkNativeStart;
244    // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
245    ASSERT(offset <= std::numeric_limits<int32_t>::max());
246    text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
247}
248
249static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
250{
251    ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
252    text->chunkContents = static_cast<const UChar*>(text->q);
253    textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
254}
255
256static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
257{
258    if (forward) {
259        if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
260            int64_t offset = nativeIndex - text->chunkNativeStart;
261            // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
262            ASSERT(offset <= std::numeric_limits<int32_t>::max());
263            text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
264            isAccessible = TRUE;
265            return true;
266        }
267        if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
268            text->chunkOffset = text->chunkLength;
269            isAccessible = FALSE;
270            return true;
271        }
272    } else {
273        if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
274            int64_t offset = nativeIndex - text->chunkNativeStart;
275            // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
276            ASSERT(offset <= std::numeric_limits<int32_t>::max());
277            text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
278            isAccessible = TRUE;
279            return true;
280        }
281        if (nativeIndex <= 0 && !text->chunkNativeStart) {
282            text->chunkOffset = 0;
283            isAccessible = FALSE;
284            return true;
285        }
286    }
287    return false;
288}
289
290static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
291{
292    if (!text->context)
293        return FALSE;
294    int64_t nativeLength = textNativeLength(text);
295    UBool isAccessible;
296    if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
297        return isAccessible;
298    nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
299    TextContext currentContext = textLatin1GetCurrentContext(text);
300    TextContext newContext = textGetContext(text, nativeIndex, forward);
301    ASSERT(newContext != NoContext);
302    if (newContext == currentContext) {
303        if (currentContext == PrimaryContext) {
304            textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
305        } else {
306            textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
307        }
308    } else if (newContext == PrimaryContext) {
309        textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
310    } else {
311        ASSERT(newContext == PriorContext);
312        textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
313    }
314    return TRUE;
315}
316
// Provider function table for Latin-1 texts. Slots are listed in the order of
// ICU's UTextFuncs struct; a 0 entry means the operation is not provided.
static const struct UTextFuncs textLatin1Funcs = {
    sizeof(UTextFuncs), // tableSize
    0, 0, 0, // reserved1, reserved2, reserved3
    textClone,
    textNativeLength,
    textLatin1Access,
    textExtract,
    0, 0, 0, 0, // replace, copy, mapOffsetToNative, mapNativeIndexToUTF16
    textClose,
    0, 0, 0, // spare1, spare2, spare3
};
328
329static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
330{
331    text->pFuncs = funcs;
332    text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
333    text->context = string;
334    text->p = string;
335    text->a = length;
336    text->q = priorContext;
337    text->b = priorContextLength;
338}
339
340static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
341{
342    if (U_FAILURE(*status))
343        return 0;
344
345    if (!string || length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) {
346        *status = U_ILLEGAL_ARGUMENT_ERROR;
347        return 0;
348    }
349    UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
350    if (U_FAILURE(*status)) {
351        ASSERT(!text);
352        return 0;
353    }
354    textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
355    return text;
356}
357
358static inline TextContext textUTF16GetCurrentContext(const UText* text)
359{
360    if (!text->chunkContents)
361        return NoContext;
362    return text->chunkContents == text->p ? PrimaryContext : PriorContext;
363}
364
365static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
366{
367    ASSERT(text->chunkContents == text->p);
368    ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
369    ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
370    text->chunkNativeStart = text->b;
371    text->chunkNativeLimit = nativeLength;
372    int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
373    // Ensure chunk length is well defined if computed length exceeds int32_t range.
374    ASSERT(length <= std::numeric_limits<int32_t>::max());
375    text->chunkLength = length <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
376    text->nativeIndexingLimit = text->chunkLength;
377    int64_t offset = nativeIndex - text->chunkNativeStart;
378    // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
379    ASSERT(offset <= std::numeric_limits<int32_t>::max());
380    text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
381}
382
383static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
384{
385    ASSERT(!text->chunkContents || text->chunkContents == text->q);
386    text->chunkContents = static_cast<const UChar*>(text->p);
387    textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
388}
389
390static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
391{
392    ASSERT(text->chunkContents == text->q);
393    ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
394    ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
395    ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
396    text->chunkNativeStart = 0;
397    text->chunkNativeLimit = text->b;
398    text->chunkLength = text->b;
399    text->nativeIndexingLimit = text->chunkLength;
400    int64_t offset = nativeIndex - text->chunkNativeStart;
401    // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
402    ASSERT(offset <= std::numeric_limits<int32_t>::max());
403    text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
404}
405
406static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
407{
408    ASSERT(!text->chunkContents || text->chunkContents == text->p);
409    text->chunkContents = static_cast<const UChar*>(text->q);
410    textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
411}
412
413static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
414{
415    if (!text->context)
416        return FALSE;
417    int64_t nativeLength = textNativeLength(text);
418    UBool isAccessible;
419    if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
420        return isAccessible;
421    nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
422    TextContext currentContext = textUTF16GetCurrentContext(text);
423    TextContext newContext = textGetContext(text, nativeIndex, forward);
424    ASSERT(newContext != NoContext);
425    if (newContext == currentContext) {
426        if (currentContext == PrimaryContext) {
427            textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
428        } else {
429            textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
430        }
431    } else if (newContext == PrimaryContext) {
432        textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
433    } else {
434        ASSERT(newContext == PriorContext);
435        textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
436    }
437    return TRUE;
438}
439
// Provider function table for UTF-16 texts. Slots are listed in the order of
// ICU's UTextFuncs struct; a 0 entry means the operation is not provided.
static const struct UTextFuncs textUTF16Funcs = {
    sizeof(UTextFuncs), // tableSize
    0, 0, 0, // reserved1, reserved2, reserved3
    textClone,
    textNativeLength,
    textUTF16Access,
    textExtract,
    0, 0, 0, 0, // replace, copy, mapOffsetToNative, mapNativeIndexToUTF16
    textClose,
    0, 0, 0, // spare1, spare2, spare3
};
451
452static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
453{
454    if (U_FAILURE(*status))
455        return 0;
456
457    if (!string || length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) {
458        *status = U_ILLEGAL_ARGUMENT_ERROR;
459        return 0;
460    }
461
462    text = utext_setup(text, 0, status);
463    if (U_FAILURE(*status)) {
464        ASSERT(!text);
465        return 0;
466    }
467    textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
468    return text;
469}
470
// Pristine UText used to initialize the |text| member of stack-allocated
// UTextWithBuffer objects before they are opened.
static UText emptyText = UTEXT_INITIALIZER;
472
473static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
474{
475    UErrorCode errorCode = U_ZERO_ERROR;
476    static TextBreakIterator* breakIter = 0;
477    if (!breakIter) {
478        breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
479        ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
480        if (!breakIter)
481            return 0;
482    }
483
484    UTextWithBuffer textLocal;
485    textLocal.text = emptyText;
486    textLocal.text.extraSize = sizeof(textLocal.buffer);
487    textLocal.text.pExtra = textLocal.buffer;
488
489    UErrorCode openStatus = U_ZERO_ERROR;
490    UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
491    if (U_FAILURE(openStatus)) {
492        WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
493        return 0;
494    }
495
496    UErrorCode setTextStatus = U_ZERO_ERROR;
497    breakIter->setText(text, setTextStatus);
498    if (U_FAILURE(setTextStatus))
499        WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
500
501    utext_close(text);
502
503    return breakIter;
504}
505
506static void setText16(TextBreakIterator* iter, const UChar* string, int length)
507{
508    UErrorCode errorCode = U_ZERO_ERROR;
509    UText uText = UTEXT_INITIALIZER;
510    utext_openUChars(&uText, string, length, &errorCode);
511    if (U_FAILURE(errorCode))
512        return;
513    iter->setText(&uText, errorCode);
514}
515
516TextBreakIterator* wordBreakIterator(const UChar* string, int length)
517{
518    UErrorCode errorCode = U_ZERO_ERROR;
519    static TextBreakIterator* breakIter = 0;
520    if (!breakIter) {
521        breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
522        ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
523        if (!breakIter)
524            return 0;
525    }
526    setText16(breakIter, string, length);
527    return breakIter;
528}
529
530TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
531{
532    if (string.isEmpty())
533        return 0;
534    if (string.is8Bit())
535        return wordBreakIterator(string.characters8() + start, length);
536    return wordBreakIterator(string.characters16() + start, length);
537}
538
539TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
540{
541    TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
542    if (!iterator)
543        return 0;
544
545    UTextWithBuffer textLocal;
546    textLocal.text = emptyText;
547    textLocal.text.extraSize = sizeof(textLocal.buffer);
548    textLocal.text.pExtra = textLocal.buffer;
549
550    UErrorCode openStatus = U_ZERO_ERROR;
551    UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
552    if (U_FAILURE(openStatus)) {
553        WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
554        return 0;
555    }
556
557    UErrorCode setTextStatus = U_ZERO_ERROR;
558    iterator->setText(text, setTextStatus);
559    if (U_FAILURE(setTextStatus)) {
560        WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
561        return 0;
562    }
563
564    utext_close(text);
565
566    return iterator;
567}
568
569TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
570{
571    TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
572    if (!iterator)
573        return 0;
574
575    UText textLocal = UTEXT_INITIALIZER;
576
577    UErrorCode openStatus = U_ZERO_ERROR;
578    UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
579    if (U_FAILURE(openStatus)) {
580        WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
581        return 0;
582    }
583
584    UErrorCode setTextStatus = U_ZERO_ERROR;
585    iterator->setText(text, setTextStatus);
586    if (U_FAILURE(setTextStatus)) {
587        WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
588        return 0;
589    }
590
591    utext_close(text);
592
593    return iterator;
594}
595
596void releaseLineBreakIterator(TextBreakIterator* iterator)
597{
598    ASSERT_ARG(iterator, iterator);
599
600    LineBreakIteratorPool::sharedPool().put(iterator);
601}
602
603static TextBreakIterator* nonSharedCharacterBreakIterator;
604
605static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
606{
607    DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
608    MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
609    if (nonSharedCharacterBreakIterator != expected)
610        return false;
611    nonSharedCharacterBreakIterator = newValue;
612    return true;
613}
614
615NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
616    : m_is8Bit(true)
617    , m_charaters8(0)
618    , m_offset(0)
619    , m_length(0)
620    , m_iterator(0)
621{
622    if (string.isEmpty())
623        return;
624
625    m_is8Bit = string.is8Bit();
626
627    if (m_is8Bit) {
628        m_charaters8 = string.characters8();
629        m_offset = 0;
630        m_length = string.length();
631        return;
632    }
633
634    createIteratorForBuffer(string.characters16(), string.length());
635}
636
// Builds a character break iterator over the UTF-16 |buffer| of |length| code
// units. This form always uses an ICU iterator.
NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
    : m_is8Bit(false)
    , m_charaters8(0)
    , m_offset(0)
    , m_length(0)
    , m_iterator(0)
{
    createIteratorForBuffer(buffer, length);
}
646
647void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
648{
649    m_iterator = nonSharedCharacterBreakIterator;
650    bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
651    if (!createdIterator) {
652        UErrorCode errorCode = U_ZERO_ERROR;
653        m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
654        ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
655    }
656
657    setText16(m_iterator, buffer, length);
658}
659
660NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
661{
662    if (m_is8Bit)
663        return;
664    if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
665        delete m_iterator;
666}
667
668int NonSharedCharacterBreakIterator::next()
669{
670    if (!m_is8Bit)
671        return m_iterator->next();
672
673    if (m_offset >= m_length)
674        return TextBreakDone;
675
676    m_offset += clusterLengthStartingAt(m_offset);
677    return m_offset;
678}
679
680int NonSharedCharacterBreakIterator::current()
681{
682    if (!m_is8Bit)
683        return m_iterator->current();
684    return m_offset;
685}
686
687bool NonSharedCharacterBreakIterator::isBreak(int offset) const
688{
689    if (!m_is8Bit)
690        return m_iterator->isBoundary(offset);
691    return !isLFAfterCR(offset);
692}
693
694int NonSharedCharacterBreakIterator::preceding(int offset) const
695{
696    if (!m_is8Bit)
697        return m_iterator->preceding(offset);
698    if (offset <= 0)
699        return TextBreakDone;
700    if (isLFAfterCR(offset))
701        return offset - 2;
702    return offset - 1;
703}
704
705int NonSharedCharacterBreakIterator::following(int offset) const
706{
707    if (!m_is8Bit)
708        return m_iterator->following(offset);
709    if (static_cast<unsigned>(offset) >= m_length)
710        return TextBreakDone;
711    return offset + clusterLengthStartingAt(offset);
712}
713
714TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
715{
716    UErrorCode openStatus = U_ZERO_ERROR;
717    static TextBreakIterator* iterator = 0;
718    if (!iterator) {
719        iterator =  icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
720        ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
721        if (!iterator)
722            return 0;
723    }
724
725    setText16(iterator, string, length);
726    return iterator;
727}
728
729bool isWordTextBreak(TextBreakIterator* iterator)
730{
731    icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
732    int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
733    return ruleStatus != UBRK_WORD_NONE;
734}
735
736static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
737{
738    if (!string)
739        return 0;
740
741    static TextBreakIterator* iterator = 0;
742    if (!iterator) {
743        UParseError parseStatus;
744        UErrorCode openStatus = U_ZERO_ERROR;
745        Vector<UChar> rules;
746        String(breakRules).appendTo(rules);
747
748        iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
749        ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
750        if (!iterator)
751            return 0;
752    }
753
754    setText16(iterator, string, length);
755    return iterator;
756}
757
// Returns the break iterator used for cursor movement, set to the first
// |length| code units of |string|. It is built from the custom rule set below
// through setUpIteratorWithRules(), which returns 0 when |string| is null.
TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
{
    // This rule set is based on character-break iterator rules of ICU 4.0
    // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
    // The major differences from the original ones are listed below:
    // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
    // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
    // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
    // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks.
    // * Added rules for regional indicator symbols.
    static const char* const kRules =
        "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
        "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
        "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
        "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
        "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
        "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
        "$L       = [\\p{Grapheme_Cluster_Break = L}];"
        "$V       = [\\p{Grapheme_Cluster_Break = V}];"
        "$T       = [\\p{Grapheme_Cluster_Break = T}];"
        "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
        "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
        "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
        "$HinV    = \\u094D;"              // Devanagari Sign Virama
        "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
        "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
        "$BenV    = \\u09CD;"              // Bengali Sign Virama
        "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
        "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
        "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
        "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
        "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
        "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
        "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
        "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
        "$OriV    = \\u0B4D;"              // Oriya Sign Virama
        "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
        "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
        "$TelV    = \\u0C4D;"              // Telugu Sign Virama
        "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha. NOTE(review): range starts at U+0C14 while every other script's Ka range starts at xx15 - confirm this is intended.
        "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
        "$KanV    = \\u0CCD;"              // Kannada Sign Virama
        "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter Ka,...,Ha
        "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
        "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
        "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter Ka,...,Ha
        "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
        "!!chain;"
        "!!forward;"
        "$CR $LF;"
        "$L ($L | $V | $LV | $LVT);"
        "($LV | $V) ($V | $T);"
        "($LVT | $T) $T;"
        "[^$Control $CR $LF] $Extend;"
        "[^$Control $CR $LF] $SpacingMark;"
        "$RI $RI / $RI;"
        "$RI $RI;"
        "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
        "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
        "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
        "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
        "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
        "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
        "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
        "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
        "!!reverse;"
        "$LF $CR;"
        "($L | $V | $LV | $LVT) $L;"
        "($V | $T) ($LV | $V);"
        "$T ($LVT | $T);"
        "$Extend      [^$Control $CR $LF];"
        "$SpacingMark [^$Control $CR $LF];"
        "$RI $RI / $RI $RI;"
        "$RI $RI;"
        "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
        "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
        "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
        "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
        "$Ori1 $OriV $Ori0;"               // Oriya Virama (backward)
        "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
        "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
        "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
        "!!safe_reverse;"
        "!!safe_forward;";

    return setUpIteratorWithRules(kRules, string, length);
}
845
846}
847