1/*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "TextIterator.h"
29
30#include "Document.h"
31#include "Frame.h"
32#include "HTMLElement.h"
33#include "HTMLNames.h"
34#include "htmlediting.h"
35#include "InlineTextBox.h"
36#include "Range.h"
37#include "RenderTableCell.h"
38#include "RenderTableRow.h"
39#include "RenderTextControl.h"
40#include "RenderTextFragment.h"
41#include "TextBoundaries.h"
42#include "TextBreakIterator.h"
43#include "VisiblePosition.h"
44#include "visible_units.h"
45#include <wtf/unicode/CharacterNames.h>
46
47#if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48#include "TextBreakIteratorInternalICU.h"
49#include <unicode/usearch.h>
50#endif
51
52using namespace WTF::Unicode;
53using namespace std;
54
55namespace WebCore {
56
57using namespace HTMLNames;
58
59// Buffer that knows how to compare with a search target.
60// Keeps enough of the previous text to be able to search in the future, but no more.
61// Non-breaking spaces are always equal to normal spaces.
62// Case folding is also done if the CaseInsensitive option is specified.
63// Matches are further filtered if the AtWordStarts option is specified, although some
64// matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65class SearchBuffer {
66    WTF_MAKE_NONCOPYABLE(SearchBuffer);
67public:
68    SearchBuffer(const String& target, FindOptions);
69    ~SearchBuffer();
70
71    // Returns number of characters appended; guaranteed to be in the range [1, length].
72    size_t append(const UChar*, size_t length);
73    bool needsMoreContext() const;
74    void prependContext(const UChar*, size_t length);
75    void reachedBreak();
76
77    // Result is the size in characters of what was found.
78    // And <startOffset> is the number of characters back to the start of what was found.
79    size_t search(size_t& startOffset);
80    bool atBreak() const;
81
82#if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83
84private:
85    bool isBadMatch(const UChar*, size_t length) const;
86    bool isWordStartMatch(size_t start, size_t length) const;
87
88    String m_target;
89    FindOptions m_options;
90
91    Vector<UChar> m_buffer;
92    size_t m_overlap;
93    size_t m_prefixLength;
94    bool m_atBreak;
95    bool m_needsMoreContext;
96
97    bool m_targetRequiresKanaWorkaround;
98    Vector<UChar> m_normalizedTarget;
99    mutable Vector<UChar> m_normalizedMatch;
100
101#else
102
103private:
104    void append(UChar, bool isCharacterStart);
105    size_t length() const;
106
107    String m_target;
108    FindOptions m_options;
109
110    Vector<UChar> m_buffer;
111    Vector<bool> m_isCharacterStartBuffer;
112    bool m_isBufferFull;
113    size_t m_cursor;
114
115#endif
116};
117
118// --------
119
120static const unsigned bitsInWord = sizeof(unsigned) * 8;
121static const unsigned bitInWordMask = bitsInWord - 1;
122
123BitStack::BitStack()
124    : m_size(0)
125{
126}
127
128BitStack::~BitStack()
129{
130}
131
132void BitStack::push(bool bit)
133{
134    unsigned index = m_size / bitsInWord;
135    unsigned shift = m_size & bitInWordMask;
136    if (!shift && index == m_words.size()) {
137        m_words.grow(index + 1);
138        m_words[index] = 0;
139    }
140    unsigned& word = m_words[index];
141    unsigned mask = 1U << shift;
142    if (bit)
143        word |= mask;
144    else
145        word &= ~mask;
146    ++m_size;
147}
148
149void BitStack::pop()
150{
151    if (m_size)
152        --m_size;
153}
154
155bool BitStack::top() const
156{
157    if (!m_size)
158        return false;
159    unsigned shift = (m_size - 1) & bitInWordMask;
160    return m_words.last() & (1U << shift);
161}
162
163unsigned BitStack::size() const
164{
165    return m_size;
166}
167
168// --------
169
170#if !ASSERT_DISABLED
171
172static unsigned depthCrossingShadowBoundaries(Node* node)
173{
174    unsigned depth = 0;
175    for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176        ++depth;
177    return depth;
178}
179
180#endif
181
182// This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
183static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184{
185    if (!rangeEndContainer)
186        return 0;
187    if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188        if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189            return next;
190    }
191    for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192        if (Node* next = node->nextSibling())
193            return next;
194    }
195    return 0;
196}
197
198// --------
199
200static inline bool fullyClipsContents(Node* node)
201{
202    RenderObject* renderer = node->renderer();
203    if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204        return false;
205    return toRenderBox(renderer)->size().isEmpty();
206}
207
208static inline bool ignoresContainerClip(Node* node)
209{
210    RenderObject* renderer = node->renderer();
211    if (!renderer || renderer->isText())
212        return false;
213    EPosition position = renderer->style()->position();
214    return position == AbsolutePosition || position == FixedPosition;
215}
216
217static void pushFullyClippedState(BitStack& stack, Node* node)
218{
219    ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220
221    // Push true if this node full clips its contents, or if a parent already has fully
222    // clipped and this is not a node that ignores its container's clip.
223    stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224}
225
226static void setUpFullyClippedStack(BitStack& stack, Node* node)
227{
228    // Put the nodes in a vector so we can iterate in reverse order.
229    Vector<Node*, 100> ancestry;
230    for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231        ancestry.append(parent);
232
233    // Call pushFullyClippedState on each node starting with the earliest ancestor.
234    size_t size = ancestry.size();
235    for (size_t i = 0; i < size; ++i)
236        pushFullyClippedState(stack, ancestry[size - i - 1]);
237    pushFullyClippedState(stack, node);
238
239    ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240}
241
242// --------
243
244TextIterator::TextIterator()
245    : m_startContainer(0)
246    , m_startOffset(0)
247    , m_endContainer(0)
248    , m_endOffset(0)
249    , m_positionNode(0)
250    , m_textCharacters(0)
251    , m_textLength(0)
252    , m_remainingTextBox(0)
253    , m_firstLetterText(0)
254    , m_lastCharacter(0)
255    , m_emitsCharactersBetweenAllVisiblePositions(false)
256    , m_entersTextControls(false)
257    , m_emitsTextWithoutTranscoding(false)
258    , m_handledFirstLetter(false)
259    , m_ignoresStyleVisibility(false)
260    , m_emitsObjectReplacementCharacters(false)
261{
262}
263
264TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
265    : m_startContainer(0)
266    , m_startOffset(0)
267    , m_endContainer(0)
268    , m_endOffset(0)
269    , m_positionNode(0)
270    , m_textCharacters(0)
271    , m_textLength(0)
272    , m_remainingTextBox(0)
273    , m_firstLetterText(0)
274    , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
275    , m_entersTextControls(behavior & TextIteratorEntersTextControls)
276    , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
277    , m_handledFirstLetter(false)
278    , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
279    , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
280{
281    if (!r)
282        return;
283
284    // get and validate the range endpoints
285    Node* startContainer = r->startContainer();
286    if (!startContainer)
287        return;
288    int startOffset = r->startOffset();
289    Node* endContainer = r->endContainer();
290    int endOffset = r->endOffset();
291
292    // Callers should be handing us well-formed ranges. If we discover that this isn't
293    // the case, we could consider changing this assertion to an early return.
294    ASSERT(r->boundaryPointsValid());
295
296    // remember range - this does not change
297    m_startContainer = startContainer;
298    m_startOffset = startOffset;
299    m_endContainer = endContainer;
300    m_endOffset = endOffset;
301
302    // set up the current node for processing
303    m_node = r->firstNode();
304    if (!m_node)
305        return;
306    setUpFullyClippedStack(m_fullyClippedStack, m_node);
307    m_offset = m_node == m_startContainer ? m_startOffset : 0;
308    m_handledNode = false;
309    m_handledChildren = false;
310
311    // calculate first out of bounds node
312    m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
313
314    // initialize node processing state
315    m_needsAnotherNewline = false;
316    m_textBox = 0;
317
318    // initialize record of previous node processing
319    m_hasEmitted = false;
320    m_lastTextNode = 0;
321    m_lastTextNodeEndedWithCollapsedSpace = false;
322    m_lastCharacter = 0;
323
324#ifndef NDEBUG
325    // need this just because of the assert in advance()
326    m_positionNode = m_node;
327#endif
328
329    // identify the first run
330    advance();
331}
332
333TextIterator::~TextIterator()
334{
335}
336
337void TextIterator::advance()
338{
339    // reset the run information
340    m_positionNode = 0;
341    m_textLength = 0;
342
343    // handle remembered node that needed a newline after the text node's newline
344    if (m_needsAnotherNewline) {
345        // Emit the extra newline, and position it *inside* m_node, after m_node's
346        // contents, in case it's a block, in the same way that we position the first
347        // newline.  The range for the emitted newline should start where the line
348        // break begins.
349        // FIXME: It would be cleaner if we emitted two newlines during the last
350        // iteration, instead of using m_needsAnotherNewline.
351        Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
352        emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
353        m_needsAnotherNewline = false;
354        return;
355    }
356
357    if (!m_textBox && m_remainingTextBox) {
358        m_textBox = m_remainingTextBox;
359        m_remainingTextBox = 0;
360        m_firstLetterText = 0;
361        m_offset = 0;
362    }
363    // handle remembered text box
364    if (m_textBox) {
365        handleTextBox();
366        if (m_positionNode)
367            return;
368    }
369
370    while (m_node && m_node != m_pastEndNode) {
371        // if the range ends at offset 0 of an element, represent the
372        // position, but not the content, of that element e.g. if the
373        // node is a blockflow element, emit a newline that
374        // precedes the element
375        if (m_node == m_endContainer && m_endOffset == 0) {
376            representNodeOffsetZero();
377            m_node = 0;
378            return;
379        }
380
381        RenderObject* renderer = m_node->renderer();
382        if (!renderer) {
383            m_handledNode = true;
384            m_handledChildren = true;
385        } else {
386            // handle current node according to its type
387            if (!m_handledNode) {
388                if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
389                    m_handledNode = handleTextNode();
390                else if (renderer && (renderer->isImage() || renderer->isWidget() ||
391                         (renderer->node() && renderer->node()->isElementNode() &&
392                          static_cast<Element*>(renderer->node())->isFormControlElement())))
393                    m_handledNode = handleReplacedElement();
394                else
395                    m_handledNode = handleNonTextNode();
396                if (m_positionNode)
397                    return;
398            }
399        }
400
401        // find a new current node to handle in depth-first manner,
402        // calling exitNode() as we come back thru a parent node
403        Node* next = m_handledChildren ? 0 : m_node->firstChild();
404        m_offset = 0;
405        if (!next) {
406            next = m_node->nextSibling();
407            if (!next) {
408                bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
409                Node* parentNode = m_node->parentOrHostNode();
410                while (!next && parentNode) {
411                    if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
412                        return;
413                    bool haveRenderer = m_node->renderer();
414                    m_node = parentNode;
415                    m_fullyClippedStack.pop();
416                    parentNode = m_node->parentOrHostNode();
417                    if (haveRenderer)
418                        exitNode();
419                    if (m_positionNode) {
420                        m_handledNode = true;
421                        m_handledChildren = true;
422                        return;
423                    }
424                    next = m_node->nextSibling();
425                }
426            }
427            m_fullyClippedStack.pop();
428        }
429
430        // set the new current node
431        m_node = next;
432        if (m_node)
433            pushFullyClippedState(m_fullyClippedStack, m_node);
434        m_handledNode = false;
435        m_handledChildren = false;
436        m_handledFirstLetter = false;
437        m_firstLetterText = 0;
438
439        // how would this ever be?
440        if (m_positionNode)
441            return;
442    }
443}
444
445bool TextIterator::handleTextNode()
446{
447    if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
448        return false;
449
450    RenderText* renderer = toRenderText(m_node->renderer());
451
452    m_lastTextNode = m_node;
453    String str = renderer->text();
454
455    // handle pre-formatted text
456    if (!renderer->style()->collapseWhiteSpace()) {
457        int runStart = m_offset;
458        if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
459            emitCharacter(' ', m_node, 0, runStart, runStart);
460            return false;
461        }
462        if (!m_handledFirstLetter && renderer->isTextFragment()) {
463            handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
464            if (m_firstLetterText) {
465                String firstLetter = m_firstLetterText->text();
466                emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
467                m_firstLetterText = 0;
468                m_textBox = 0;
469                return false;
470            }
471        }
472        if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
473            return false;
474        int strLength = str.length();
475        int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
476        int runEnd = min(strLength, end);
477
478        if (runStart >= runEnd)
479            return true;
480
481        emitText(m_node, runStart, runEnd);
482        return true;
483    }
484
485    if (!renderer->firstTextBox() && str.length() > 0) {
486        if (!m_handledFirstLetter && renderer->isTextFragment()) {
487            handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
488            if (m_firstLetterText) {
489                handleTextBox();
490                return false;
491            }
492        }
493        if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
494            return false;
495        m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
496        return true;
497    }
498
499    // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
500    if (renderer->containsReversedText()) {
501        m_sortedTextBoxes.clear();
502        for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
503            m_sortedTextBoxes.append(textBox);
504        }
505        std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
506        m_sortedTextBoxesPosition = 0;
507    }
508
509    m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
510    if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
511        handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
512    handleTextBox();
513    return true;
514}
515
516void TextIterator::handleTextBox()
517{
518    RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
519    if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
520        m_textBox = 0;
521        return;
522    }
523    String str = renderer->text();
524    unsigned start = m_offset;
525    unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
526    while (m_textBox) {
527        unsigned textBoxStart = m_textBox->start();
528        unsigned runStart = max(textBoxStart, start);
529
530        // Check for collapsed space at the start of this run.
531        InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
532        bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
533            || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
534        if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
535            if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
536                unsigned spaceRunStart = runStart - 1;
537                while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
538                    --spaceRunStart;
539                emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
540            } else
541                emitCharacter(' ', m_node, 0, runStart, runStart);
542            return;
543        }
544        unsigned textBoxEnd = textBoxStart + m_textBox->len();
545        unsigned runEnd = min(textBoxEnd, end);
546
547        // Determine what the next text box will be, but don't advance yet
548        InlineTextBox* nextTextBox = 0;
549        if (renderer->containsReversedText()) {
550            if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
551                nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
552        } else
553            nextTextBox = m_textBox->nextTextBox();
554
555        if (runStart < runEnd) {
556            // Handle either a single newline character (which becomes a space),
557            // or a run of characters that does not include a newline.
558            // This effectively translates newlines to spaces without copying the text.
559            if (str[runStart] == '\n') {
560                emitCharacter(' ', m_node, 0, runStart, runStart + 1);
561                m_offset = runStart + 1;
562            } else {
563                size_t subrunEnd = str.find('\n', runStart);
564                if (subrunEnd == notFound || subrunEnd > runEnd)
565                    subrunEnd = runEnd;
566
567                m_offset = subrunEnd;
568                emitText(m_node, renderer, runStart, subrunEnd);
569            }
570
571            // If we are doing a subrun that doesn't go to the end of the text box,
572            // come back again to finish handling this text box; don't advance to the next one.
573            if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
574                return;
575
576            // Advance and return
577            unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
578            if (nextRunStart > runEnd)
579                m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
580            m_textBox = nextTextBox;
581            if (renderer->containsReversedText())
582                ++m_sortedTextBoxesPosition;
583            return;
584        }
585        // Advance and continue
586        m_textBox = nextTextBox;
587        if (renderer->containsReversedText())
588            ++m_sortedTextBoxesPosition;
589    }
590    if (!m_textBox && m_remainingTextBox) {
591        m_textBox = m_remainingTextBox;
592        m_remainingTextBox = 0;
593        m_firstLetterText = 0;
594        m_offset = 0;
595        handleTextBox();
596    }
597}
598
599void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
600{
601    if (renderer->firstLetter()) {
602        RenderObject* r = renderer->firstLetter();
603        if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
604            return;
605        for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
606            if (currChild->isText()) {
607                RenderText* firstLetter = toRenderText(currChild);
608                m_handledFirstLetter = true;
609                m_remainingTextBox = m_textBox;
610                m_textBox = firstLetter->firstTextBox();
611                m_firstLetterText = firstLetter;
612                return;
613            }
614        }
615    }
616    m_handledFirstLetter = true;
617}
618
619bool TextIterator::handleReplacedElement()
620{
621    if (m_fullyClippedStack.top())
622        return false;
623
624    RenderObject* renderer = m_node->renderer();
625    if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
626        return false;
627
628    if (m_lastTextNodeEndedWithCollapsedSpace) {
629        emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
630        return false;
631    }
632
633    if (m_entersTextControls && renderer->isTextControl()) {
634        if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
635            m_node = innerTextElement->shadowTreeRootNode();
636            pushFullyClippedState(m_fullyClippedStack, m_node);
637            m_offset = 0;
638            return false;
639        }
640    }
641
642    m_hasEmitted = true;
643
644    if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
645        emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
646        return true;
647    }
648
649    if (m_emitsCharactersBetweenAllVisiblePositions) {
650        // We want replaced elements to behave like punctuation for boundary
651        // finding, and to simply take up space for the selection preservation
652        // code in moveParagraphs, so we use a comma.
653        emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
654        return true;
655    }
656
657    m_positionNode = m_node->parentNode();
658    m_positionOffsetBaseNode = m_node;
659    m_positionStartOffset = 0;
660    m_positionEndOffset = 1;
661
662    m_textCharacters = 0;
663    m_textLength = 0;
664
665    m_lastCharacter = 0;
666
667    return true;
668}
669
670bool TextIterator::hasVisibleTextNode(RenderText* renderer)
671{
672    if (renderer->style()->visibility() == VISIBLE)
673        return true;
674    if (renderer->isTextFragment()) {
675        RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
676        if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
677            return true;
678    }
679    return false;
680}
681
682static bool shouldEmitTabBeforeNode(Node* node)
683{
684    RenderObject* r = node->renderer();
685
686    // Table cells are delimited by tabs.
687    if (!r || !isTableCell(node))
688        return false;
689
690    // Want a tab before every cell other than the first one
691    RenderTableCell* rc = toRenderTableCell(r);
692    RenderTable* t = rc->table();
693    return t && (t->cellBefore(rc) || t->cellAbove(rc));
694}
695
696static bool shouldEmitNewlineForNode(Node* node)
697{
698    // br elements are represented by a single newline.
699    RenderObject* r = node->renderer();
700    if (!r)
701        return node->hasTagName(brTag);
702
703    return r->isBR();
704}
705
706static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
707{
708    // Block flow (versus inline flow) is represented by having
709    // a newline both before and after the element.
710    RenderObject* r = node->renderer();
711    if (!r) {
712        return (node->hasTagName(blockquoteTag)
713                || node->hasTagName(ddTag)
714                || node->hasTagName(divTag)
715                || node->hasTagName(dlTag)
716                || node->hasTagName(dtTag)
717                || node->hasTagName(h1Tag)
718                || node->hasTagName(h2Tag)
719                || node->hasTagName(h3Tag)
720                || node->hasTagName(h4Tag)
721                || node->hasTagName(h5Tag)
722                || node->hasTagName(h6Tag)
723                || node->hasTagName(hrTag)
724                || node->hasTagName(liTag)
725                || node->hasTagName(listingTag)
726                || node->hasTagName(olTag)
727                || node->hasTagName(pTag)
728                || node->hasTagName(preTag)
729                || node->hasTagName(trTag)
730                || node->hasTagName(ulTag));
731    }
732
733    // Need to make an exception for table cells, because they are blocks, but we
734    // want them tab-delimited rather than having newlines before and after.
735    if (isTableCell(node))
736        return false;
737
738    // Need to make an exception for table row elements, because they are neither
739    // "inline" or "RenderBlock", but we want newlines for them.
740    if (r->isTableRow()) {
741        RenderTable* t = toRenderTableRow(r)->table();
742        if (t && !t->isInline())
743            return true;
744    }
745
746    return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
747}
748
749static bool shouldEmitNewlineAfterNode(Node* node)
750{
751    // FIXME: It should be better but slower to create a VisiblePosition here.
752    if (!shouldEmitNewlinesBeforeAndAfterNode(node))
753        return false;
754    // Check if this is the very last renderer in the document.
755    // If so, then we should not emit a newline.
756    while ((node = node->traverseNextSibling()))
757        if (node->renderer())
758            return true;
759    return false;
760}
761
762static bool shouldEmitNewlineBeforeNode(Node* node)
763{
764    return shouldEmitNewlinesBeforeAndAfterNode(node);
765}
766
767static bool shouldEmitExtraNewlineForNode(Node* node)
768{
769    // When there is a significant collapsed bottom margin, emit an extra
770    // newline for a more realistic result.  We end up getting the right
771    // result even without margin collapsing. For example: <div><p>text</p></div>
772    // will work right even if both the <div> and the <p> have bottom margins.
773    RenderObject* r = node->renderer();
774    if (!r || !r->isBox())
775        return false;
776
777    // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
778    // not to do this at all
779    if (node->hasTagName(h1Tag)
780        || node->hasTagName(h2Tag)
781        || node->hasTagName(h3Tag)
782        || node->hasTagName(h4Tag)
783        || node->hasTagName(h5Tag)
784        || node->hasTagName(h6Tag)
785        || node->hasTagName(pTag)) {
786        RenderStyle* style = r->style();
787        if (style) {
788            int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
789            int fontSize = style->fontDescription().computedPixelSize();
790            if (bottomMargin * 2 >= fontSize)
791                return true;
792        }
793    }
794
795    return false;
796}
797
798static int collapsedSpaceLength(RenderText* renderer, int textEnd)
799{
800    const UChar* characters = renderer->text()->characters();
801    int length = renderer->text()->length();
802    for (int i = textEnd; i < length; ++i) {
803        if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
804            return i - textEnd;
805    }
806
807    return length - textEnd;
808}
809
810static int maxOffsetIncludingCollapsedSpaces(Node* node)
811{
812    int offset = caretMaxOffset(node);
813
814    if (node->renderer() && node->renderer()->isText())
815        offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
816
817    return offset;
818}
819
820// Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
821bool TextIterator::shouldRepresentNodeOffsetZero()
822{
823    if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
824        return true;
825
826    // Leave element positioned flush with start of a paragraph
827    // (e.g. do not insert tab before a table cell at the start of a paragraph)
828    if (m_lastCharacter == '\n')
829        return false;
830
831    // Otherwise, show the position if we have emitted any characters
832    if (m_hasEmitted)
833        return true;
834
835    // We've not emitted anything yet. Generally, there is no need for any positioning then.
836    // The only exception is when the element is visually not in the same line as
837    // the start of the range (e.g. the range starts at the end of the previous paragraph).
838    // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
839    // make quicker checks to possibly avoid that. Another check that we could make is
840    // is whether the inline vs block flow changed since the previous visible element.
841    // I think we're already in a special enough case that that won't be needed, tho.
842
843    // No character needed if this is the first node in the range.
844    if (m_node == m_startContainer)
845        return false;
846
847    // If we are outside the start container's subtree, assume we need to emit.
848    // FIXME: m_startContainer could be an inline block
849    if (!m_node->isDescendantOf(m_startContainer))
850        return true;
851
852    // If we started as m_startContainer offset 0 and the current node is a descendant of
853    // the start container, we already had enough context to correctly decide whether to
854    // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
855    // so don't second guess that now.
856    // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
857    // immaterial since we likely would have already emitted something by now.
858    if (m_startOffset == 0)
859        return false;
860
861    // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
862    // Additionally, if the range we are iterating over contains huge sections of unrendered content,
863    // we would create VisiblePositions on every call to this function without this check.
864    if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
865        return false;
866
867    // The startPos.isNotNull() check is needed because the start could be before the body,
868    // and in that case we'll get null. We don't want to put in newlines at the start in that case.
869    // The currPos.isNotNull() check is needed because positions in non-HTML content
870    // (like SVG) do not have visible positions, and we don't want to emit for them either.
871    VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
872    VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
873    return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
874}
875
876bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
877{
878    return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
879}
880
881void TextIterator::representNodeOffsetZero()
882{
883    // Emit a character to show the positioning of m_node.
884
885    // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
886    // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
887    // on m_node to see if it necessitates emitting a character first and will early return
888    // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
889    if (shouldEmitTabBeforeNode(m_node)) {
890        if (shouldRepresentNodeOffsetZero())
891            emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
892    } else if (shouldEmitNewlineBeforeNode(m_node)) {
893        if (shouldRepresentNodeOffsetZero())
894            emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
895    } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
896        if (shouldRepresentNodeOffsetZero())
897            emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
898    }
899}
900
901bool TextIterator::handleNonTextNode()
902{
903    if (shouldEmitNewlineForNode(m_node))
904        emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
905    else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
906        emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
907    else
908        representNodeOffsetZero();
909
910    return true;
911}
912
913void TextIterator::exitNode()
914{
915    // prevent emitting a newline when exiting a collapsed block at beginning of the range
916    // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
917    // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
918    // therefore look like a blank line.
919    if (!m_hasEmitted)
920        return;
921
922    // Emit with a position *inside* m_node, after m_node's contents, in
923    // case it is a block, because the run should start where the
924    // emitted character is positioned visually.
925    Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
926    // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
927    // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
928    // TextIterator in _web_attributedStringFromRange.
929    // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
930    if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
931        // use extra newline to represent margin bottom, as needed
932        bool addNewline = shouldEmitExtraNewlineForNode(m_node);
933
934        // FIXME: We need to emit a '\n' as we leave an empty block(s) that
935        // contain a VisiblePosition when doing selection preservation.
936        if (m_lastCharacter != '\n') {
937            // insert a newline with a position following this block's contents.
938            emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
939            // remember whether to later add a newline for the current node
940            ASSERT(!m_needsAnotherNewline);
941            m_needsAnotherNewline = addNewline;
942        } else if (addNewline)
943            // insert a newline with a position following this block's contents.
944            emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
945    }
946
947    // If nothing was emitted, see if we need to emit a space.
948    if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
949        emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
950}
951
952void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
953{
954    m_hasEmitted = true;
955
956    // remember information with which to construct the TextIterator::range()
957    // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
958    m_positionNode = textNode;
959    m_positionOffsetBaseNode = offsetBaseNode;
960    m_positionStartOffset = textStartOffset;
961    m_positionEndOffset = textEndOffset;
962
963    // remember information with which to construct the TextIterator::characters() and length()
964    m_singleCharacterBuffer = c;
965    m_textCharacters = &m_singleCharacterBuffer;
966    m_textLength = 1;
967
968    // remember some iteration state
969    m_lastTextNodeEndedWithCollapsedSpace = false;
970    m_lastCharacter = c;
971}
972
973void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
974{
975    RenderText* renderer = toRenderText(renderObject);
976    m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
977    ASSERT(m_text.characters());
978
979    m_positionNode = textNode;
980    m_positionOffsetBaseNode = 0;
981    m_positionStartOffset = textStartOffset;
982    m_positionEndOffset = textEndOffset;
983    m_textCharacters = m_text.characters() + textStartOffset;
984    m_textLength = textEndOffset - textStartOffset;
985    m_lastCharacter = m_text[textEndOffset - 1];
986
987    m_lastTextNodeEndedWithCollapsedSpace = false;
988    m_hasEmitted = true;
989}
990
991void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
992{
993    emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
994}
995
996PassRefPtr<Range> TextIterator::range() const
997{
998    // use the current run information, if we have it
999    if (m_positionNode) {
1000        if (m_positionOffsetBaseNode) {
1001            int index = m_positionOffsetBaseNode->nodeIndex();
1002            m_positionStartOffset += index;
1003            m_positionEndOffset += index;
1004            m_positionOffsetBaseNode = 0;
1005        }
1006        return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1007    }
1008
1009    // otherwise, return the end of the overall range we were given
1010    if (m_endContainer)
1011        return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1012
1013    return 0;
1014}
1015
1016Node* TextIterator::node() const
1017{
1018    RefPtr<Range> textRange = range();
1019    if (!textRange)
1020        return 0;
1021
1022    Node* node = textRange->startContainer();
1023    if (!node)
1024        return 0;
1025    if (node->offsetInCharacters())
1026        return node;
1027
1028    return node->childNode(textRange->startOffset());
1029}
1030
1031// --------
1032
1033SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1034    : m_behavior(TextIteratorDefaultBehavior)
1035    , m_node(0)
1036    , m_positionNode(0)
1037{
1038}
1039
1040SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1041    : m_behavior(behavior)
1042    , m_node(0)
1043    , m_positionNode(0)
1044{
1045    ASSERT(m_behavior == TextIteratorDefaultBehavior);
1046
1047    if (!r)
1048        return;
1049
1050    Node* startNode = r->startContainer();
1051    if (!startNode)
1052        return;
1053    Node* endNode = r->endContainer();
1054    int startOffset = r->startOffset();
1055    int endOffset = r->endOffset();
1056
1057    if (!startNode->offsetInCharacters()) {
1058        if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1059            startNode = startNode->childNode(startOffset);
1060            startOffset = 0;
1061        }
1062    }
1063    if (!endNode->offsetInCharacters()) {
1064        if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1065            endNode = endNode->childNode(endOffset - 1);
1066            endOffset = lastOffsetInNode(endNode);
1067        }
1068    }
1069
1070    m_node = endNode;
1071    setUpFullyClippedStack(m_fullyClippedStack, m_node);
1072    m_offset = endOffset;
1073    m_handledNode = false;
1074    m_handledChildren = endOffset == 0;
1075
1076    m_startNode = startNode;
1077    m_startOffset = startOffset;
1078    m_endNode = endNode;
1079    m_endOffset = endOffset;
1080
1081#ifndef NDEBUG
1082    // Need this just because of the assert.
1083    m_positionNode = endNode;
1084#endif
1085
1086    m_lastTextNode = 0;
1087    m_lastCharacter = '\n';
1088
1089    m_havePassedStartNode = false;
1090
1091    advance();
1092}
1093
1094void SimplifiedBackwardsTextIterator::advance()
1095{
1096    ASSERT(m_positionNode);
1097
1098    m_positionNode = 0;
1099    m_textLength = 0;
1100
1101    while (m_node && !m_havePassedStartNode) {
1102        // Don't handle node if we start iterating at [node, 0].
1103        if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1104            RenderObject* renderer = m_node->renderer();
1105            if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1106                // FIXME: What about CDATA_SECTION_NODE?
1107                if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1108                    m_handledNode = handleTextNode();
1109            } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1110                if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1111                    m_handledNode = handleReplacedElement();
1112            } else
1113                m_handledNode = handleNonTextNode();
1114            if (m_positionNode)
1115                return;
1116        }
1117
1118        if (!m_handledChildren && m_node->hasChildNodes()) {
1119            m_node = m_node->lastChild();
1120            pushFullyClippedState(m_fullyClippedStack, m_node);
1121        } else {
1122            // Exit empty containers as we pass over them or containers
1123            // where [container, 0] is where we started iterating.
1124            if (!m_handledNode
1125                    && canHaveChildrenForEditing(m_node)
1126                    && m_node->parentNode()
1127                    && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1128                exitNode();
1129                if (m_positionNode) {
1130                    m_handledNode = true;
1131                    m_handledChildren = true;
1132                    return;
1133                }
1134            }
1135
1136            // Exit all other containers.
1137            while (!m_node->previousSibling()) {
1138                if (!advanceRespectingRange(m_node->parentOrHostNode()))
1139                    break;
1140                m_fullyClippedStack.pop();
1141                exitNode();
1142                if (m_positionNode) {
1143                    m_handledNode = true;
1144                    m_handledChildren = true;
1145                    return;
1146                }
1147            }
1148
1149            m_fullyClippedStack.pop();
1150            if (advanceRespectingRange(m_node->previousSibling()))
1151                pushFullyClippedState(m_fullyClippedStack, m_node);
1152            else
1153                m_node = 0;
1154        }
1155
1156        // For the purpose of word boundary detection,
1157        // we should iterate all visible text and trailing (collapsed) whitespaces.
1158        m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1159        m_handledNode = false;
1160        m_handledChildren = false;
1161
1162        if (m_positionNode)
1163            return;
1164    }
1165}
1166
1167bool SimplifiedBackwardsTextIterator::handleTextNode()
1168{
1169    m_lastTextNode = m_node;
1170
1171    RenderText* renderer = toRenderText(m_node->renderer());
1172    String str = renderer->text();
1173
1174    if (!renderer->firstTextBox() && str.length() > 0)
1175        return true;
1176
1177    m_positionEndOffset = m_offset;
1178
1179    m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1180    m_positionNode = m_node;
1181    m_positionStartOffset = m_offset;
1182    m_textLength = m_positionEndOffset - m_positionStartOffset;
1183    m_textCharacters = str.characters() + m_positionStartOffset;
1184
1185    m_lastCharacter = str[m_positionEndOffset - 1];
1186
1187    return true;
1188}
1189
1190bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1191{
1192    unsigned index = m_node->nodeIndex();
1193    // We want replaced elements to behave like punctuation for boundary
1194    // finding, and to simply take up space for the selection preservation
1195    // code in moveParagraphs, so we use a comma.  Unconditionally emit
1196    // here because this iterator is only used for boundary finding.
1197    emitCharacter(',', m_node->parentNode(), index, index + 1);
1198    return true;
1199}
1200
1201bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1202{
1203    // We can use a linefeed in place of a tab because this simple iterator is only used to
1204    // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
1205    if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1206        unsigned index = m_node->nodeIndex();
1207        // The start of this emitted range is wrong. Ensuring correctness would require
1208        // VisiblePositions and so would be slow. previousBoundary expects this.
1209        emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1210    }
1211    return true;
1212}
1213
1214void SimplifiedBackwardsTextIterator::exitNode()
1215{
1216    if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1217        // The start of this emitted range is wrong. Ensuring correctness would require
1218        // VisiblePositions and so would be slow. previousBoundary expects this.
1219        emitCharacter('\n', m_node, 0, 0);
1220    }
1221}
1222
1223void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1224{
1225    m_singleCharacterBuffer = c;
1226    m_positionNode = node;
1227    m_positionStartOffset = startOffset;
1228    m_positionEndOffset = endOffset;
1229    m_textCharacters = &m_singleCharacterBuffer;
1230    m_textLength = 1;
1231    m_lastCharacter = c;
1232}
1233
1234bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1235{
1236    if (!next)
1237        return false;
1238    m_havePassedStartNode |= m_node == m_startNode;
1239    if (m_havePassedStartNode)
1240        return false;
1241    m_node = next;
1242    return true;
1243}
1244
1245PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1246{
1247    if (m_positionNode)
1248        return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1249
1250    return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1251}
1252
1253// --------
1254
1255CharacterIterator::CharacterIterator()
1256    : m_offset(0)
1257    , m_runOffset(0)
1258    , m_atBreak(true)
1259{
1260}
1261
1262CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1263    : m_offset(0)
1264    , m_runOffset(0)
1265    , m_atBreak(true)
1266    , m_textIterator(r, behavior)
1267{
1268    while (!atEnd() && m_textIterator.length() == 0)
1269        m_textIterator.advance();
1270}
1271
1272PassRefPtr<Range> CharacterIterator::range() const
1273{
1274    RefPtr<Range> r = m_textIterator.range();
1275    if (!m_textIterator.atEnd()) {
1276        if (m_textIterator.length() <= 1) {
1277            ASSERT(m_runOffset == 0);
1278        } else {
1279            Node* n = r->startContainer();
1280            ASSERT(n == r->endContainer());
1281            int offset = r->startOffset() + m_runOffset;
1282            ExceptionCode ec = 0;
1283            r->setStart(n, offset, ec);
1284            r->setEnd(n, offset + 1, ec);
1285            ASSERT(!ec);
1286        }
1287    }
1288    return r.release();
1289}
1290
1291void CharacterIterator::advance(int count)
1292{
1293    if (count <= 0) {
1294        ASSERT(count == 0);
1295        return;
1296    }
1297
1298    m_atBreak = false;
1299
1300    // easy if there is enough left in the current m_textIterator run
1301    int remaining = m_textIterator.length() - m_runOffset;
1302    if (count < remaining) {
1303        m_runOffset += count;
1304        m_offset += count;
1305        return;
1306    }
1307
1308    // exhaust the current m_textIterator run
1309    count -= remaining;
1310    m_offset += remaining;
1311
1312    // move to a subsequent m_textIterator run
1313    for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1314        int runLength = m_textIterator.length();
1315        if (runLength == 0)
1316            m_atBreak = true;
1317        else {
1318            // see whether this is m_textIterator to use
1319            if (count < runLength) {
1320                m_runOffset = count;
1321                m_offset += count;
1322                return;
1323            }
1324
1325            // exhaust this m_textIterator run
1326            count -= runLength;
1327            m_offset += runLength;
1328        }
1329    }
1330
1331    // ran to the end of the m_textIterator... no more runs left
1332    m_atBreak = true;
1333    m_runOffset = 0;
1334}
1335
1336String CharacterIterator::string(int numChars)
1337{
1338    Vector<UChar> result;
1339    result.reserveInitialCapacity(numChars);
1340    while (numChars > 0 && !atEnd()) {
1341        int runSize = min(numChars, length());
1342        result.append(characters(), runSize);
1343        numChars -= runSize;
1344        advance(runSize);
1345    }
1346    return String::adopt(result);
1347}
1348
1349static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1350{
1351    it.advance(offset);
1352    RefPtr<Range> start = it.range();
1353
1354    if (length > 1)
1355        it.advance(length - 1);
1356    RefPtr<Range> end = it.range();
1357
1358    return Range::create(start->startContainer()->document(),
1359        start->startContainer(), start->startOffset(),
1360        end->endContainer(), end->endOffset());
1361}
1362
1363BackwardsCharacterIterator::BackwardsCharacterIterator()
1364    : m_offset(0)
1365    , m_runOffset(0)
1366    , m_atBreak(true)
1367{
1368}
1369
1370BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1371    : m_offset(0)
1372    , m_runOffset(0)
1373    , m_atBreak(true)
1374    , m_textIterator(range, behavior)
1375{
1376    while (!atEnd() && !m_textIterator.length())
1377        m_textIterator.advance();
1378}
1379
1380PassRefPtr<Range> BackwardsCharacterIterator::range() const
1381{
1382    RefPtr<Range> r = m_textIterator.range();
1383    if (!m_textIterator.atEnd()) {
1384        if (m_textIterator.length() <= 1)
1385            ASSERT(m_runOffset == 0);
1386        else {
1387            Node* n = r->startContainer();
1388            ASSERT(n == r->endContainer());
1389            int offset = r->endOffset() - m_runOffset;
1390            ExceptionCode ec = 0;
1391            r->setStart(n, offset - 1, ec);
1392            r->setEnd(n, offset, ec);
1393            ASSERT(!ec);
1394        }
1395    }
1396    return r.release();
1397}
1398
1399void BackwardsCharacterIterator::advance(int count)
1400{
1401    if (count <= 0) {
1402        ASSERT(!count);
1403        return;
1404    }
1405
1406    m_atBreak = false;
1407
1408    int remaining = m_textIterator.length() - m_runOffset;
1409    if (count < remaining) {
1410        m_runOffset += count;
1411        m_offset += count;
1412        return;
1413    }
1414
1415    count -= remaining;
1416    m_offset += remaining;
1417
1418    for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1419        int runLength = m_textIterator.length();
1420        if (runLength == 0)
1421            m_atBreak = true;
1422        else {
1423            if (count < runLength) {
1424                m_runOffset = count;
1425                m_offset += count;
1426                return;
1427            }
1428
1429            count -= runLength;
1430            m_offset += runLength;
1431        }
1432    }
1433
1434    m_atBreak = true;
1435    m_runOffset = 0;
1436}
1437
1438// --------
1439
1440WordAwareIterator::WordAwareIterator()
1441    : m_previousText(0)
1442    , m_didLookAhead(false)
1443{
1444}
1445
1446WordAwareIterator::WordAwareIterator(const Range* r)
1447    : m_previousText(0)
1448    , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1449    , m_textIterator(r)
1450{
1451    advance(); // get in position over the first chunk of text
1452}
1453
1454WordAwareIterator::~WordAwareIterator()
1455{
1456}
1457
1458// We're always in one of these modes:
1459// - The current chunk in the text iterator is our current chunk
1460//      (typically its a piece of whitespace, or text that ended with whitespace)
1461// - The previous chunk in the text iterator is our current chunk
1462//      (we looked ahead to the next chunk and found a word boundary)
1463// - We built up our own chunk of text from many chunks from the text iterator
1464
1465// FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1466
1467void WordAwareIterator::advance()
1468{
1469    m_previousText = 0;
1470    m_buffer.clear();      // toss any old buffer we built up
1471
1472    // If last time we did a look-ahead, start with that looked-ahead chunk now
1473    if (!m_didLookAhead) {
1474        ASSERT(!m_textIterator.atEnd());
1475        m_textIterator.advance();
1476    }
1477    m_didLookAhead = false;
1478
1479    // Go to next non-empty chunk
1480    while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1481        m_textIterator.advance();
1482    m_range = m_textIterator.range();
1483
1484    if (m_textIterator.atEnd())
1485        return;
1486
1487    while (1) {
1488        // If this chunk ends in whitespace we can just use it as our chunk.
1489        if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1490            return;
1491
1492        // If this is the first chunk that failed, save it in previousText before look ahead
1493        if (m_buffer.isEmpty()) {
1494            m_previousText = m_textIterator.characters();
1495            m_previousLength = m_textIterator.length();
1496        }
1497
1498        // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1499        m_textIterator.advance();
1500        if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1501            m_didLookAhead = true;
1502            return;
1503        }
1504
1505        if (m_buffer.isEmpty()) {
1506            // Start gobbling chunks until we get to a suitable stopping point
1507            m_buffer.append(m_previousText, m_previousLength);
1508            m_previousText = 0;
1509        }
1510        m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1511        int exception = 0;
1512        m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1513    }
1514}
1515
1516int WordAwareIterator::length() const
1517{
1518    if (!m_buffer.isEmpty())
1519        return m_buffer.size();
1520    if (m_previousText)
1521        return m_previousLength;
1522    return m_textIterator.length();
1523}
1524
1525const UChar* WordAwareIterator::characters() const
1526{
1527    if (!m_buffer.isEmpty())
1528        return m_buffer.data();
1529    if (m_previousText)
1530        return m_previousText;
1531    return m_textIterator.characters();
1532}
1533
1534// --------
1535
1536static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1537{
1538    switch (c) {
1539        case hebrewPunctuationGershayim:
1540        case leftDoubleQuotationMark:
1541        case rightDoubleQuotationMark:
1542            return '"';
1543        case hebrewPunctuationGeresh:
1544        case leftSingleQuotationMark:
1545        case rightSingleQuotationMark:
1546            return '\'';
1547        case softHyphen:
1548            // Replace soft hyphen with an ignorable character so that their presence or absence will
1549            // not affect string comparison.
1550            return 0;
1551        default:
1552            return c;
1553    }
1554}
1555
1556static inline void foldQuoteMarksAndSoftHyphens(String& s)
1557{
1558    s.replace(hebrewPunctuationGeresh, '\'');
1559    s.replace(hebrewPunctuationGershayim, '"');
1560    s.replace(leftDoubleQuotationMark, '"');
1561    s.replace(leftSingleQuotationMark, '\'');
1562    s.replace(rightDoubleQuotationMark, '"');
1563    s.replace(rightSingleQuotationMark, '\'');
1564    // Replace soft hyphen with an ignorable character so that their presence or absence will
1565    // not affect string comparison.
1566    s.replace(softHyphen, 0);
1567}
1568
1569#if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1570
1571static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1572{
1573    for (size_t i = 0; i < length; ++i)
1574        data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1575}
1576
1577static const size_t minimumSearchBufferSize = 8192;
1578
1579#ifndef NDEBUG
1580static bool searcherInUse;
1581#endif
1582
1583static UStringSearch* createSearcher()
1584{
1585    // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1586    // but it doesn't matter exactly what it is, since we don't perform any searches
1587    // without setting both the pattern and the text.
1588    UErrorCode status = U_ZERO_ERROR;
1589    UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1590    ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1591    return searcher;
1592}
1593
1594static UStringSearch* searcher()
1595{
1596    static UStringSearch* searcher = createSearcher();
1597    return searcher;
1598}
1599
1600static inline void lockSearcher()
1601{
1602#ifndef NDEBUG
1603    ASSERT(!searcherInUse);
1604    searcherInUse = true;
1605#endif
1606}
1607
1608static inline void unlockSearcher()
1609{
1610#ifndef NDEBUG
1611    ASSERT(searcherInUse);
1612    searcherInUse = false;
1613#endif
1614}
1615
1616// ICU's search ignores the distinction between small kana letters and ones
1617// that are not small, and also characters that differ only in the voicing
1618// marks when considering only primary collation strength diffrences.
1619// This is not helpful for end users, since these differences make words
1620// distinct, so for our purposes we need these to be considered.
1621// The Unicode folks do not think the collation algorithm should be
1622// changed. To work around this, we would like to tailor the ICU searcher,
1623// but we can't get that to work yet. So instead, we check for cases where
1624// these differences occur, and skip those matches.
1625
1626// We refer to the above technique as the "kana workaround". The next few
1627// functions are helper functinos for the kana workaround.
1628
1629static inline bool isKanaLetter(UChar character)
1630{
1631    // Hiragana letters.
1632    if (character >= 0x3041 && character <= 0x3096)
1633        return true;
1634
1635    // Katakana letters.
1636    if (character >= 0x30A1 && character <= 0x30FA)
1637        return true;
1638    if (character >= 0x31F0 && character <= 0x31FF)
1639        return true;
1640
1641    // Halfwidth katakana letters.
1642    if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1643        return true;
1644
1645    return false;
1646}
1647
1648static inline bool isSmallKanaLetter(UChar character)
1649{
1650    ASSERT(isKanaLetter(character));
1651
1652    switch (character) {
1653    case 0x3041: // HIRAGANA LETTER SMALL A
1654    case 0x3043: // HIRAGANA LETTER SMALL I
1655    case 0x3045: // HIRAGANA LETTER SMALL U
1656    case 0x3047: // HIRAGANA LETTER SMALL E
1657    case 0x3049: // HIRAGANA LETTER SMALL O
1658    case 0x3063: // HIRAGANA LETTER SMALL TU
1659    case 0x3083: // HIRAGANA LETTER SMALL YA
1660    case 0x3085: // HIRAGANA LETTER SMALL YU
1661    case 0x3087: // HIRAGANA LETTER SMALL YO
1662    case 0x308E: // HIRAGANA LETTER SMALL WA
1663    case 0x3095: // HIRAGANA LETTER SMALL KA
1664    case 0x3096: // HIRAGANA LETTER SMALL KE
1665    case 0x30A1: // KATAKANA LETTER SMALL A
1666    case 0x30A3: // KATAKANA LETTER SMALL I
1667    case 0x30A5: // KATAKANA LETTER SMALL U
1668    case 0x30A7: // KATAKANA LETTER SMALL E
1669    case 0x30A9: // KATAKANA LETTER SMALL O
1670    case 0x30C3: // KATAKANA LETTER SMALL TU
1671    case 0x30E3: // KATAKANA LETTER SMALL YA
1672    case 0x30E5: // KATAKANA LETTER SMALL YU
1673    case 0x30E7: // KATAKANA LETTER SMALL YO
1674    case 0x30EE: // KATAKANA LETTER SMALL WA
1675    case 0x30F5: // KATAKANA LETTER SMALL KA
1676    case 0x30F6: // KATAKANA LETTER SMALL KE
1677    case 0x31F0: // KATAKANA LETTER SMALL KU
1678    case 0x31F1: // KATAKANA LETTER SMALL SI
1679    case 0x31F2: // KATAKANA LETTER SMALL SU
1680    case 0x31F3: // KATAKANA LETTER SMALL TO
1681    case 0x31F4: // KATAKANA LETTER SMALL NU
1682    case 0x31F5: // KATAKANA LETTER SMALL HA
1683    case 0x31F6: // KATAKANA LETTER SMALL HI
1684    case 0x31F7: // KATAKANA LETTER SMALL HU
1685    case 0x31F8: // KATAKANA LETTER SMALL HE
1686    case 0x31F9: // KATAKANA LETTER SMALL HO
1687    case 0x31FA: // KATAKANA LETTER SMALL MU
1688    case 0x31FB: // KATAKANA LETTER SMALL RA
1689    case 0x31FC: // KATAKANA LETTER SMALL RI
1690    case 0x31FD: // KATAKANA LETTER SMALL RU
1691    case 0x31FE: // KATAKANA LETTER SMALL RE
1692    case 0x31FF: // KATAKANA LETTER SMALL RO
1693    case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1694    case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1695    case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1696    case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1697    case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1698    case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1699    case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1700    case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1701    case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1702        return true;
1703    }
1704    return false;
1705}
1706
1707enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1708
1709static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1710{
1711    ASSERT(isKanaLetter(character));
1712
1713    switch (character) {
1714    case 0x304C: // HIRAGANA LETTER GA
1715    case 0x304E: // HIRAGANA LETTER GI
1716    case 0x3050: // HIRAGANA LETTER GU
1717    case 0x3052: // HIRAGANA LETTER GE
1718    case 0x3054: // HIRAGANA LETTER GO
1719    case 0x3056: // HIRAGANA LETTER ZA
1720    case 0x3058: // HIRAGANA LETTER ZI
1721    case 0x305A: // HIRAGANA LETTER ZU
1722    case 0x305C: // HIRAGANA LETTER ZE
1723    case 0x305E: // HIRAGANA LETTER ZO
1724    case 0x3060: // HIRAGANA LETTER DA
1725    case 0x3062: // HIRAGANA LETTER DI
1726    case 0x3065: // HIRAGANA LETTER DU
1727    case 0x3067: // HIRAGANA LETTER DE
1728    case 0x3069: // HIRAGANA LETTER DO
1729    case 0x3070: // HIRAGANA LETTER BA
1730    case 0x3073: // HIRAGANA LETTER BI
1731    case 0x3076: // HIRAGANA LETTER BU
1732    case 0x3079: // HIRAGANA LETTER BE
1733    case 0x307C: // HIRAGANA LETTER BO
1734    case 0x3094: // HIRAGANA LETTER VU
1735    case 0x30AC: // KATAKANA LETTER GA
1736    case 0x30AE: // KATAKANA LETTER GI
1737    case 0x30B0: // KATAKANA LETTER GU
1738    case 0x30B2: // KATAKANA LETTER GE
1739    case 0x30B4: // KATAKANA LETTER GO
1740    case 0x30B6: // KATAKANA LETTER ZA
1741    case 0x30B8: // KATAKANA LETTER ZI
1742    case 0x30BA: // KATAKANA LETTER ZU
1743    case 0x30BC: // KATAKANA LETTER ZE
1744    case 0x30BE: // KATAKANA LETTER ZO
1745    case 0x30C0: // KATAKANA LETTER DA
1746    case 0x30C2: // KATAKANA LETTER DI
1747    case 0x30C5: // KATAKANA LETTER DU
1748    case 0x30C7: // KATAKANA LETTER DE
1749    case 0x30C9: // KATAKANA LETTER DO
1750    case 0x30D0: // KATAKANA LETTER BA
1751    case 0x30D3: // KATAKANA LETTER BI
1752    case 0x30D6: // KATAKANA LETTER BU
1753    case 0x30D9: // KATAKANA LETTER BE
1754    case 0x30DC: // KATAKANA LETTER BO
1755    case 0x30F4: // KATAKANA LETTER VU
1756    case 0x30F7: // KATAKANA LETTER VA
1757    case 0x30F8: // KATAKANA LETTER VI
1758    case 0x30F9: // KATAKANA LETTER VE
1759    case 0x30FA: // KATAKANA LETTER VO
1760        return VoicedSoundMark;
1761    case 0x3071: // HIRAGANA LETTER PA
1762    case 0x3074: // HIRAGANA LETTER PI
1763    case 0x3077: // HIRAGANA LETTER PU
1764    case 0x307A: // HIRAGANA LETTER PE
1765    case 0x307D: // HIRAGANA LETTER PO
1766    case 0x30D1: // KATAKANA LETTER PA
1767    case 0x30D4: // KATAKANA LETTER PI
1768    case 0x30D7: // KATAKANA LETTER PU
1769    case 0x30DA: // KATAKANA LETTER PE
1770    case 0x30DD: // KATAKANA LETTER PO
1771        return SemiVoicedSoundMark;
1772    }
1773    return NoVoicedSoundMark;
1774}
1775
1776static inline bool isCombiningVoicedSoundMark(UChar character)
1777{
1778    switch (character) {
1779    case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1780    case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1781        return true;
1782    }
1783    return false;
1784}
1785
1786static inline bool containsKanaLetters(const String& pattern)
1787{
1788    const UChar* characters = pattern.characters();
1789    unsigned length = pattern.length();
1790    for (unsigned i = 0; i < length; ++i) {
1791        if (isKanaLetter(characters[i]))
1792            return true;
1793    }
1794    return false;
1795}
1796
1797static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1798{
1799    ASSERT(length);
1800
1801    buffer.resize(length);
1802
1803    UErrorCode status = U_ZERO_ERROR;
1804    size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1805    ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1806    ASSERT(bufferSize);
1807
1808    buffer.resize(bufferSize);
1809
1810    if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1811        return;
1812
1813    status = U_ZERO_ERROR;
1814    unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1815    ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1816}
1817
1818static bool isNonLatin1Separator(UChar32 character)
1819{
1820    ASSERT_ARG(character, character >= 256);
1821
1822    return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1823}
1824
1825static inline bool isSeparator(UChar32 character)
1826{
1827    static const bool latin1SeparatorTable[256] = {
1828        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1829        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1830        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1831        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
1832        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
1833        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
1834        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
1835        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
1836        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1837        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1838        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1839        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1840        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1841        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1842        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1843        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1844    };
1845
1846    if (character < 256)
1847        return latin1SeparatorTable[character];
1848
1849    return isNonLatin1Separator(character);
1850}
1851
1852inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1853    : m_target(target)
1854    , m_options(options)
1855    , m_prefixLength(0)
1856    , m_atBreak(true)
1857    , m_needsMoreContext(options & AtWordStarts)
1858    , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1859{
1860    ASSERT(!m_target.isEmpty());
1861
1862    // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1863    // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1864    // to add tailoring on top of the locale-specific tailoring as of this writing.
1865    foldQuoteMarksAndSoftHyphens(m_target);
1866
1867    size_t targetLength = m_target.length();
1868    m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1869    m_overlap = m_buffer.capacity() / 4;
1870
1871    if ((m_options & AtWordStarts) && targetLength) {
1872        UChar32 targetFirstCharacter;
1873        U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1874        // Characters in the separator category never really occur at the beginning of a word,
1875        // so if the target begins with such a character, we just ignore the AtWordStart option.
1876        if (isSeparator(targetFirstCharacter)) {
1877            m_options &= ~AtWordStarts;
1878            m_needsMoreContext = false;
1879        }
1880    }
1881
1882    // Grab the single global searcher.
1883    // If we ever have a reason to do more than once search buffer at once, we'll have
1884    // to move to multiple searchers.
1885    lockSearcher();
1886
1887    UStringSearch* searcher = WebCore::searcher();
1888    UCollator* collator = usearch_getCollator(searcher);
1889
1890    UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1891    if (ucol_getStrength(collator) != strength) {
1892        ucol_setStrength(collator, strength);
1893        usearch_reset(searcher);
1894    }
1895
1896    UErrorCode status = U_ZERO_ERROR;
1897    usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1898    ASSERT(status == U_ZERO_ERROR);
1899
1900    // The kana workaround requires a normalized copy of the target string.
1901    if (m_targetRequiresKanaWorkaround)
1902        normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1903}
1904
1905inline SearchBuffer::~SearchBuffer()
1906{
1907    // Leave the static object pointing to a valid string.
1908    UErrorCode status = U_ZERO_ERROR;
1909    usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1910    ASSERT(status == U_ZERO_ERROR);
1911
1912    unlockSearcher();
1913}
1914
1915inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1916{
1917    ASSERT(length);
1918
1919    if (m_atBreak) {
1920        m_buffer.shrink(0);
1921        m_prefixLength = 0;
1922        m_atBreak = false;
1923    } else if (m_buffer.size() == m_buffer.capacity()) {
1924        memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1925        m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
1926        m_buffer.shrink(m_overlap);
1927    }
1928
1929    size_t oldLength = m_buffer.size();
1930    size_t usableLength = min(m_buffer.capacity() - oldLength, length);
1931    ASSERT(usableLength);
1932    m_buffer.append(characters, usableLength);
1933    foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
1934    return usableLength;
1935}
1936
1937inline bool SearchBuffer::needsMoreContext() const
1938{
1939    return m_needsMoreContext;
1940}
1941
1942inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
1943{
1944    ASSERT(m_needsMoreContext);
1945    ASSERT(m_prefixLength == m_buffer.size());
1946
1947    if (!length)
1948        return;
1949
1950    m_atBreak = false;
1951
1952    size_t wordBoundaryContextStart = length;
1953    if (wordBoundaryContextStart) {
1954        U16_BACK_1(characters, 0, wordBoundaryContextStart);
1955        wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
1956    }
1957
1958    size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
1959    m_buffer.prepend(characters + length - usableLength, usableLength);
1960    m_prefixLength += usableLength;
1961
1962    if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
1963        m_needsMoreContext = false;
1964}
1965
1966inline bool SearchBuffer::atBreak() const
1967{
1968    return m_atBreak;
1969}
1970
1971inline void SearchBuffer::reachedBreak()
1972{
1973    m_atBreak = true;
1974}
1975
1976inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
1977{
1978    // This function implements the kana workaround. If usearch treats
1979    // it as a match, but we do not want to, then it's a "bad match".
1980    if (!m_targetRequiresKanaWorkaround)
1981        return false;
1982
1983    // Normalize into a match buffer. We reuse a single buffer rather than
1984    // creating a new one each time.
1985    normalizeCharacters(match, matchLength, m_normalizedMatch);
1986
1987    const UChar* a = m_normalizedTarget.begin();
1988    const UChar* aEnd = m_normalizedTarget.end();
1989
1990    const UChar* b = m_normalizedMatch.begin();
1991    const UChar* bEnd = m_normalizedMatch.end();
1992
1993    while (true) {
1994        // Skip runs of non-kana-letter characters. This is necessary so we can
1995        // correctly handle strings where the target and match have different-length
1996        // runs of characters that match, while still double checking the correctness
1997        // of matches of kana letters with other kana letters.
1998        while (a != aEnd && !isKanaLetter(*a))
1999            ++a;
2000        while (b != bEnd && !isKanaLetter(*b))
2001            ++b;
2002
2003        // If we reached the end of either the target or the match, we should have
2004        // reached the end of both; both should have the same number of kana letters.
2005        if (a == aEnd || b == bEnd) {
2006            ASSERT(a == aEnd);
2007            ASSERT(b == bEnd);
2008            return false;
2009        }
2010
2011        // Check for differences in the kana letter character itself.
2012        if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2013            return true;
2014        if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2015            return true;
2016        ++a;
2017        ++b;
2018
2019        // Check for differences in combining voiced sound marks found after the letter.
2020        while (1) {
2021            if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2022                if (b != bEnd && isCombiningVoicedSoundMark(*b))
2023                    return true;
2024                break;
2025            }
2026            if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2027                return true;
2028            if (*a != *b)
2029                return true;
2030            ++a;
2031            ++b;
2032        }
2033    }
2034}
2035
2036inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2037{
2038    ASSERT(m_options & AtWordStarts);
2039
2040    if (!start)
2041        return true;
2042
2043    if (m_options & TreatMedialCapitalAsWordStart) {
2044        int size = m_buffer.size();
2045        int offset = start;
2046        UChar32 firstCharacter;
2047        U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2048        UChar32 previousCharacter;
2049        U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2050
2051        if (isSeparator(firstCharacter)) {
2052            // The start of a separator run is a word start (".org" in "webkit.org").
2053            if (!isSeparator(previousCharacter))
2054                return true;
2055        } else if (isASCIIUpper(firstCharacter)) {
2056            // The start of an uppercase run is a word start ("Kit" in "WebKit").
2057            if (!isASCIIUpper(previousCharacter))
2058                return true;
2059            // The last character of an uppercase run followed by a non-separator, non-digit
2060            // is a word start ("Request" in "XMLHTTPRequest").
2061            offset = start;
2062            U16_FWD_1(m_buffer.data(), offset, size);
2063            UChar32 nextCharacter = 0;
2064            if (offset < size)
2065                U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2066            if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2067                return true;
2068        } else if (isASCIIDigit(firstCharacter)) {
2069            // The start of a digit run is a word start ("2" in "WebKit2").
2070            if (!isASCIIDigit(previousCharacter))
2071                return true;
2072        } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2073            // The start of a non-separator, non-uppercase, non-digit run is a word start,
2074            // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2075            return true;
2076        }
2077    }
2078
2079    size_t wordBreakSearchStart = start + length;
2080    while (wordBreakSearchStart > start)
2081        wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2082    return wordBreakSearchStart == start;
2083}
2084
2085inline size_t SearchBuffer::search(size_t& start)
2086{
2087    size_t size = m_buffer.size();
2088    if (m_atBreak) {
2089        if (!size)
2090            return 0;
2091    } else {
2092        if (size != m_buffer.capacity())
2093            return 0;
2094    }
2095
2096    UStringSearch* searcher = WebCore::searcher();
2097
2098    UErrorCode status = U_ZERO_ERROR;
2099    usearch_setText(searcher, m_buffer.data(), size, &status);
2100    ASSERT(status == U_ZERO_ERROR);
2101
2102    usearch_setOffset(searcher, m_prefixLength, &status);
2103    ASSERT(status == U_ZERO_ERROR);
2104
2105    int matchStart = usearch_next(searcher, &status);
2106    ASSERT(status == U_ZERO_ERROR);
2107
2108nextMatch:
2109    if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2110        ASSERT(matchStart == USEARCH_DONE);
2111        return 0;
2112    }
2113
2114    // Matches that start in the overlap area are only tentative.
2115    // The same match may appear later, matching more characters,
2116    // possibly including a combining character that's not yet in the buffer.
2117    if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2118        size_t overlap = m_overlap;
2119        if (m_options & AtWordStarts) {
2120            // Ensure that there is sufficient context before matchStart the next time around for
2121            // determining if it is at a word boundary.
2122            int wordBoundaryContextStart = matchStart;
2123            U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2124            wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2125            overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2126        }
2127        memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2128        m_prefixLength -= min(m_prefixLength, size - overlap);
2129        m_buffer.shrink(overlap);
2130        return 0;
2131    }
2132
2133    size_t matchedLength = usearch_getMatchedLength(searcher);
2134    ASSERT(matchStart + matchedLength <= size);
2135
2136    // If this match is "bad", move on to the next match.
2137    if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2138        matchStart = usearch_next(searcher, &status);
2139        ASSERT(status == U_ZERO_ERROR);
2140        goto nextMatch;
2141    }
2142
2143    size_t newSize = size - (matchStart + 1);
2144    memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2145    m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2146    m_buffer.shrink(newSize);
2147
2148    start = size - matchStart;
2149    return matchedLength;
2150}
2151
2152#else // !ICU_UNICODE
2153
2154inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2155    : m_target(options & CaseInsensitive ? target.foldCase() : target)
2156    , m_options(options)
2157    , m_buffer(m_target.length())
2158    , m_isCharacterStartBuffer(m_target.length())
2159    , m_isBufferFull(false)
2160    , m_cursor(0)
2161{
2162    ASSERT(!m_target.isEmpty());
2163    m_target.replace(noBreakSpace, ' ');
2164    foldQuoteMarksAndSoftHyphens(m_target);
2165}
2166
2167inline SearchBuffer::~SearchBuffer()
2168{
2169}
2170
2171inline void SearchBuffer::reachedBreak()
2172{
2173    m_cursor = 0;
2174    m_isBufferFull = false;
2175}
2176
2177inline bool SearchBuffer::atBreak() const
2178{
2179    return !m_cursor && !m_isBufferFull;
2180}
2181
2182inline void SearchBuffer::append(UChar c, bool isStart)
2183{
2184    m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2185    m_isCharacterStartBuffer[m_cursor] = isStart;
2186    if (++m_cursor == m_target.length()) {
2187        m_cursor = 0;
2188        m_isBufferFull = true;
2189    }
2190}
2191
2192inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2193{
2194    ASSERT(length);
2195    if (!(m_options & CaseInsensitive)) {
2196        append(characters[0], true);
2197        return 1;
2198    }
2199    const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2200    UChar foldedCharacters[maxFoldedCharacters];
2201    bool error;
2202    int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2203    ASSERT(!error);
2204    ASSERT(numFoldedCharacters);
2205    ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2206    if (!error && numFoldedCharacters) {
2207        numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2208        append(foldedCharacters[0], true);
2209        for (int i = 1; i < numFoldedCharacters; ++i)
2210            append(foldedCharacters[i], false);
2211    }
2212    return 1;
2213}
2214
2215inline bool SearchBuffer::needsMoreContext() const
2216{
2217    return false;
2218}
2219
2220void SearchBuffer::prependContext(const UChar*, size_t)
2221{
2222    ASSERT_NOT_REACHED();
2223}
2224
2225inline size_t SearchBuffer::search(size_t& start)
2226{
2227    if (!m_isBufferFull)
2228        return 0;
2229    if (!m_isCharacterStartBuffer[m_cursor])
2230        return 0;
2231
2232    size_t tailSpace = m_target.length() - m_cursor;
2233    if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2234        return 0;
2235    if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2236        return 0;
2237
2238    start = length();
2239
2240    // Now that we've found a match once, we don't want to find it again, because those
2241    // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2242    // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2243    // we want to get rid of that in the future we could track this with a separate boolean
2244    // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2245    m_isCharacterStartBuffer[m_cursor] = false;
2246
2247    return start;
2248}
2249
2250// Returns the number of characters that were appended to the buffer (what we are searching in).
2251// That's not necessarily the same length as the passed-in target string, because case folding
2252// can make two strings match even though they're not the same length.
2253size_t SearchBuffer::length() const
2254{
2255    size_t bufferSize = m_target.length();
2256    size_t length = 0;
2257    for (size_t i = 0; i < bufferSize; ++i)
2258        length += m_isCharacterStartBuffer[i];
2259    return length;
2260}
2261
2262#endif // !ICU_UNICODE
2263
2264// --------
2265
2266int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2267{
2268    int length = 0;
2269    for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2270        length += it.length();
2271
2272    return length;
2273}
2274
2275PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2276{
2277    CharacterIterator entireRangeIterator(entireRange);
2278    return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2279}
2280
2281PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2282{
2283    RefPtr<Range> resultRange = scope->document()->createRange();
2284
2285    int docTextPosition = 0;
2286    int rangeEnd = rangeLocation + rangeLength;
2287    bool startRangeFound = false;
2288
2289    RefPtr<Range> textRunRange;
2290
2291    TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2292
2293    // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2294    if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2295        textRunRange = it.range();
2296
2297        ExceptionCode ec = 0;
2298        resultRange->setStart(textRunRange->startContainer(), 0, ec);
2299        ASSERT(!ec);
2300        resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2301        ASSERT(!ec);
2302
2303        return resultRange.release();
2304    }
2305
2306    for (; !it.atEnd(); it.advance()) {
2307        int len = it.length();
2308        textRunRange = it.range();
2309
2310        bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2311        bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2312
2313        // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2314        // in those cases that textRunRange is used.
2315        if (foundEnd) {
2316            // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2317            // position for emitted '\n's.
2318            if (len == 1 && it.characters()[0] == '\n') {
2319                scope->document()->updateLayoutIgnorePendingStylesheets();
2320                it.advance();
2321                if (!it.atEnd()) {
2322                    RefPtr<Range> range = it.range();
2323                    ExceptionCode ec = 0;
2324                    textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2325                    ASSERT(!ec);
2326                } else {
2327                    Position runStart = textRunRange->startPosition();
2328                    Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2329                    if (runEnd.isNotNull()) {
2330                        ExceptionCode ec = 0;
2331                        textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2332                        ASSERT(!ec);
2333                    }
2334                }
2335            }
2336        }
2337
2338        if (foundStart) {
2339            startRangeFound = true;
2340            int exception = 0;
2341            if (textRunRange->startContainer()->isTextNode()) {
2342                int offset = rangeLocation - docTextPosition;
2343                resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2344            } else {
2345                if (rangeLocation == docTextPosition)
2346                    resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2347                else
2348                    resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2349            }
2350        }
2351
2352        if (foundEnd) {
2353            int exception = 0;
2354            if (textRunRange->startContainer()->isTextNode()) {
2355                int offset = rangeEnd - docTextPosition;
2356                resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2357            } else {
2358                if (rangeEnd == docTextPosition)
2359                    resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2360                else
2361                    resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2362            }
2363            docTextPosition += len;
2364            break;
2365        }
2366        docTextPosition += len;
2367    }
2368
2369    if (!startRangeFound)
2370        return 0;
2371
2372    if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2373        int exception = 0;
2374        resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2375    }
2376
2377    return resultRange.release();
2378}
2379
2380bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2381{
2382    location = notFound;
2383    length = 0;
2384
2385    if (!range->startContainer())
2386        return false;
2387
2388    Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2389    Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2390
2391    // The critical assumption is that this only gets called with ranges that
2392    // concentrate on a given area containing the selection root. This is done
2393    // because of text fields and textareas. The DOM for those is not
2394    // directly in the document DOM, so ensure that the range does not cross a
2395    // boundary of one of those.
2396    if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2397        return false;
2398    if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2399        return false;
2400
2401    RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2402    ASSERT(testRange->startContainer() == scope);
2403    location = TextIterator::rangeLength(testRange.get());
2404
2405    ExceptionCode ec;
2406    testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2407    ASSERT(testRange->startContainer() == scope);
2408    length = TextIterator::rangeLength(testRange.get()) - location;
2409    return true;
2410}
2411
2412// --------
2413
2414UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2415{
2416    UChar* result = 0;
2417
2418    // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2419    // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2420    static const unsigned cMaxSegmentSize = 1 << 16;
2421    bufferLength = 0;
2422    typedef pair<UChar*, unsigned> TextSegment;
2423    OwnPtr<Vector<TextSegment> > textSegments;
2424    Vector<UChar> textBuffer;
2425    textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2426    TextIteratorBehavior behavior = defaultBehavior;
2427    if (!isDisplayString)
2428        behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2429
2430    for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2431        if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2432            UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2433            if (!newSegmentBuffer)
2434                goto exit;
2435            memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2436            if (!textSegments)
2437                textSegments = adoptPtr(new Vector<TextSegment>);
2438            textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2439            textBuffer.clear();
2440        }
2441        textBuffer.append(it.characters(), it.length());
2442        bufferLength += it.length();
2443    }
2444
2445    if (!bufferLength)
2446        return 0;
2447
2448    // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2449    result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2450    if (!result)
2451        goto exit;
2452
2453    {
2454        UChar* resultPos = result;
2455        if (textSegments) {
2456            unsigned size = textSegments->size();
2457            for (unsigned i = 0; i < size; ++i) {
2458                const TextSegment& segment = textSegments->at(i);
2459                memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2460                resultPos += segment.second;
2461            }
2462        }
2463        memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2464    }
2465
2466exit:
2467    if (textSegments) {
2468        unsigned size = textSegments->size();
2469        for (unsigned i = 0; i < size; ++i)
2470            free(textSegments->at(i).first);
2471    }
2472
2473    if (isDisplayString && r->ownerDocument())
2474        r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2475
2476    return result;
2477}
2478
2479String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2480{
2481    unsigned length;
2482    UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2483    if (!buf)
2484        return "";
2485    String result(buf, length);
2486    free(buf);
2487    return result;
2488}
2489
2490static inline bool isAllCollapsibleWhitespace(const String& string)
2491{
2492    const UChar* characters = string.characters();
2493    unsigned length = string.length();
2494    for (unsigned i = 0; i < length; ++i) {
2495        if (!isCollapsibleWhitespace(characters[i]))
2496            return false;
2497    }
2498    return true;
2499}
2500
2501static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2502{
2503    ExceptionCode ec = 0;
2504    RefPtr<Range> result = range->cloneRange(ec);
2505    ASSERT(!ec);
2506    result->collapse(!forward, ec);
2507    ASSERT(!ec);
2508    return result.release();
2509}
2510
2511static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2512{
2513    matchStart = 0;
2514    size_t matchLength = 0;
2515
2516    SearchBuffer buffer(target, options);
2517
2518    if (buffer.needsMoreContext()) {
2519        RefPtr<Range> startRange = it.range();
2520        RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2521        ExceptionCode ec = 0;
2522        beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2523        for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2524            buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2525            if (!buffer.needsMoreContext())
2526                break;
2527        }
2528    }
2529
2530    while (!it.atEnd()) {
2531        it.advance(buffer.append(it.characters(), it.length()));
2532tryAgain:
2533        size_t matchStartOffset;
2534        if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2535            // Note that we found a match, and where we found it.
2536            size_t lastCharacterInBufferOffset = it.characterOffset();
2537            ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2538            matchStart = lastCharacterInBufferOffset - matchStartOffset;
2539            matchLength = newMatchLength;
2540            // If searching forward, stop on the first match.
2541            // If searching backward, don't stop, so we end up with the last match.
2542            if (!(options & Backwards))
2543                break;
2544            goto tryAgain;
2545        }
2546        if (it.atBreak() && !buffer.atBreak()) {
2547            buffer.reachedBreak();
2548            goto tryAgain;
2549        }
2550    }
2551
2552    return matchLength;
2553}
2554
2555PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2556{
2557    // First, find the text.
2558    size_t matchStart;
2559    size_t matchLength;
2560    {
2561        CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2562        matchLength = findPlainText(findIterator, target, options, matchStart);
2563        if (!matchLength)
2564            return collapsedToBoundary(range, !(options & Backwards));
2565    }
2566
2567    // Then, find the document position of the start and the end of the text.
2568    CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2569    return characterSubrange(computeRangeIterator, matchStart, matchLength);
2570}
2571
2572}
2573