1/*
2*******************************************************************************
3* Copyright (C) 1996-2014, International Business Machines Corporation and
4* others. All Rights Reserved.
5*******************************************************************************
6*/
7
8/*
9* File coleitr.cpp
10*
11* Created by: Helena Shih
12*
13* Modification History:
14*
15*  Date      Name        Description
16*
17*  6/23/97   helena      Adding comments to make code more readable.
18* 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
19* 12/10/99   aliu        Ported Thai collation support from Java.
20* 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
21* 02/19/01   swquek      Removed CollationElementIterator() since it is
22*                        private constructor and no calls are made to it
23* 2012-2014  markus      Rewritten in C++ again.
24*/
25
26#include "unicode/utypes.h"
27
28#if !UCONFIG_NO_COLLATION
29
30#include "unicode/coleitr.h"
31#include "unicode/tblcoll.h"
32#include "unicode/ustring.h"
33#include "cmemory.h"
34#include "collation.h"
35#include "collationdata.h"
36#include "collationiterator.h"
37#include "collationsets.h"
38#include "collationtailoring.h"
39#include "uassert.h"
40#include "uhash.h"
41#include "utf16collationiterator.h"
42#include "uvectr32.h"
43
44/* Constants --------------------------------------------------------------- */
45
46U_NAMESPACE_BEGIN
47
48UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
49
50/* CollationElementIterator public constructor/destructor ------------------ */
51
52CollationElementIterator::CollationElementIterator(
53                                         const CollationElementIterator& other)
54        : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
55    *this = other;
56}
57
58CollationElementIterator::~CollationElementIterator()
59{
60    delete iter_;
61    delete offsets_;
62}
63
64/* CollationElementIterator public methods --------------------------------- */
65
66namespace {
67
68uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
69    return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
70}
71uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
72    return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
73}
74UBool ceNeedsTwoParts(int64_t ce) {
75    return (ce & INT64_C(0xffff00ff003f)) != 0;
76}
77
78}  // namespace
79
80int32_t CollationElementIterator::getOffset() const
81{
82    if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
83        // CollationIterator::previousCE() decrements the CEs length
84        // while it pops CEs from its internal buffer.
85        int32_t i = iter_->getCEsLength();
86        if (otherHalf_ != 0) {
87            // Return the trailing CE offset while we are in the middle of a 64-bit CE.
88            ++i;
89        }
90        U_ASSERT(i < offsets_->size());
91        return offsets_->elementAti(i);
92    }
93    return iter_->getOffset();
94}
95
96/**
97* Get the ordering priority of the next character in the string.
98* @return the next character's ordering. Returns NULLORDER if an error has
99*         occured or if the end of string has been reached
100*/
101int32_t CollationElementIterator::next(UErrorCode& status)
102{
103    if (U_FAILURE(status)) { return NULLORDER; }
104    if (dir_ > 1) {
105        // Continue forward iteration. Test this first.
106        if (otherHalf_ != 0) {
107            uint32_t oh = otherHalf_;
108            otherHalf_ = 0;
109            return oh;
110        }
111    } else if (dir_ == 1) {
112        // next() after setOffset()
113        dir_ = 2;
114    } else if (dir_ == 0) {
115        // The iter_ is already reset to the start of the text.
116        dir_ = 2;
117    } else /* dir_ < 0 */ {
118        // illegal change of direction
119        status = U_INVALID_STATE_ERROR;
120        return NULLORDER;
121    }
122    // No need to keep all CEs in the buffer when we iterate.
123    iter_->clearCEsIfNoneRemaining();
124    int64_t ce = iter_->nextCE(status);
125    if (ce == Collation::NO_CE) { return NULLORDER; }
126    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
127    uint32_t p = (uint32_t)(ce >> 32);
128    uint32_t lower32 = (uint32_t)ce;
129    uint32_t firstHalf = getFirstHalf(p, lower32);
130    uint32_t secondHalf = getSecondHalf(p, lower32);
131    if (secondHalf != 0) {
132        otherHalf_ = secondHalf | 0xc0;  // continuation CE
133    }
134    return firstHalf;
135}
136
137UBool CollationElementIterator::operator!=(
138                                  const CollationElementIterator& other) const
139{
140    return !(*this == other);
141}
142
143UBool CollationElementIterator::operator==(
144                                    const CollationElementIterator& that) const
145{
146    if (this == &that) {
147        return TRUE;
148    }
149
150    return
151        (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
152        otherHalf_ == that.otherHalf_ &&
153        normalizeDir() == that.normalizeDir() &&
154        string_ == that.string_ &&
155        *iter_ == *that.iter_;
156}
157
158/**
159* Get the ordering priority of the previous collation element in the string.
160* @param status the error code status.
161* @return the previous element's ordering. Returns NULLORDER if an error has
162*         occured or if the start of string has been reached.
163*/
164int32_t CollationElementIterator::previous(UErrorCode& status)
165{
166    if (U_FAILURE(status)) { return NULLORDER; }
167    if (dir_ < 0) {
168        // Continue backwards iteration. Test this first.
169        if (otherHalf_ != 0) {
170            uint32_t oh = otherHalf_;
171            otherHalf_ = 0;
172            return oh;
173        }
174    } else if (dir_ == 0) {
175        iter_->resetToOffset(string_.length());
176        dir_ = -1;
177    } else if (dir_ == 1) {
178        // previous() after setOffset()
179        dir_ = -1;
180    } else /* dir_ > 1 */ {
181        // illegal change of direction
182        status = U_INVALID_STATE_ERROR;
183        return NULLORDER;
184    }
185    if (offsets_ == NULL) {
186        offsets_ = new UVector32(status);
187        if (offsets_ == NULL) {
188            status = U_MEMORY_ALLOCATION_ERROR;
189            return NULLORDER;
190        }
191    }
192    // If we already have expansion CEs, then we also have offsets.
193    // Otherwise remember the trailing offset in case we need to
194    // write offsets for an artificial expansion.
195    int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
196    int64_t ce = iter_->previousCE(*offsets_, status);
197    if (ce == Collation::NO_CE) { return NULLORDER; }
198    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
199    uint32_t p = (uint32_t)(ce >> 32);
200    uint32_t lower32 = (uint32_t)ce;
201    uint32_t firstHalf = getFirstHalf(p, lower32);
202    uint32_t secondHalf = getSecondHalf(p, lower32);
203    if (secondHalf != 0) {
204        if (offsets_->isEmpty()) {
205            // When we convert a single 64-bit CE into two 32-bit CEs,
206            // we need to make this artificial expansion behave like a normal expansion.
207            // See CollationIterator::previousCE().
208            offsets_->addElement(iter_->getOffset(), status);
209            offsets_->addElement(limitOffset, status);
210        }
211        otherHalf_ = firstHalf;
212        return secondHalf | 0xc0;  // continuation CE
213    }
214    return firstHalf;
215}
216
217/**
218* Resets the cursor to the beginning of the string.
219*/
220void CollationElementIterator::reset()
221{
222    iter_ ->resetToOffset(0);
223    otherHalf_ = 0;
224    dir_ = 0;
225}
226
227void CollationElementIterator::setOffset(int32_t newOffset,
228                                         UErrorCode& status)
229{
230    if (U_FAILURE(status)) { return; }
231    if (0 < newOffset && newOffset < string_.length()) {
232        int32_t offset = newOffset;
233        do {
234            UChar c = string_.charAt(offset);
235            if (!rbc_->isUnsafe(c) ||
236                    (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
237                break;
238            }
239            // Back up to before this unsafe character.
240            --offset;
241        } while (offset > 0);
242        if (offset < newOffset) {
243            // We might have backed up more than necessary.
244            // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
245            // but for text "chu" setOffset(2) should remain at 2
246            // although we initially back up to offset 0.
247            // Find the last safe offset no greater than newOffset by iterating forward.
248            int32_t lastSafeOffset = offset;
249            do {
250                iter_->resetToOffset(lastSafeOffset);
251                do {
252                    iter_->nextCE(status);
253                    if (U_FAILURE(status)) { return; }
254                } while ((offset = iter_->getOffset()) == lastSafeOffset);
255                if (offset <= newOffset) {
256                    lastSafeOffset = offset;
257                }
258            } while (offset < newOffset);
259            newOffset = lastSafeOffset;
260        }
261    }
262    iter_->resetToOffset(newOffset);
263    otherHalf_ = 0;
264    dir_ = 1;
265}
266
267/**
268* Sets the source to the new source string.
269*/
270void CollationElementIterator::setText(const UnicodeString& source,
271                                       UErrorCode& status)
272{
273    if (U_FAILURE(status)) {
274        return;
275    }
276
277    string_ = source;
278    const UChar *s = string_.getBuffer();
279    CollationIterator *newIter;
280    UBool numeric = rbc_->settings->isNumeric();
281    if (rbc_->settings->dontCheckFCD()) {
282        newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
283    } else {
284        newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
285    }
286    if (newIter == NULL) {
287        status = U_MEMORY_ALLOCATION_ERROR;
288        return;
289    }
290    delete iter_;
291    iter_ = newIter;
292    otherHalf_ = 0;
293    dir_ = 0;
294}
295
296// Sets the source to the new character iterator.
297void CollationElementIterator::setText(CharacterIterator& source,
298                                       UErrorCode& status)
299{
300    if (U_FAILURE(status))
301        return;
302
303    source.getText(string_);
304    setText(string_, status);
305}
306
307int32_t CollationElementIterator::strengthOrder(int32_t order) const
308{
309    UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
310    // Mask off the unwanted differences.
311    if (s == UCOL_PRIMARY) {
312        order &= 0xffff0000;
313    }
314    else if (s == UCOL_SECONDARY) {
315        order &= 0xffffff00;
316    }
317
318    return order;
319}
320
321/* CollationElementIterator private constructors/destructors --------------- */
322
323/**
324* This is the "real" constructor for this class; it constructs an iterator
325* over the source text using the specified collator
326*/
327CollationElementIterator::CollationElementIterator(
328                                               const UnicodeString &source,
329                                               const RuleBasedCollator *coll,
330                                               UErrorCode &status)
331        : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
332    setText(source, status);
333}
334
335/**
336* This is the "real" constructor for this class; it constructs an iterator over
337* the source text using the specified collator
338*/
339CollationElementIterator::CollationElementIterator(
340                                           const CharacterIterator &source,
341                                           const RuleBasedCollator *coll,
342                                           UErrorCode &status)
343        : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
344    // We only call source.getText() which should be const anyway.
345    setText(const_cast<CharacterIterator &>(source), status);
346}
347
348/* CollationElementIterator private methods -------------------------------- */
349
350const CollationElementIterator& CollationElementIterator::operator=(
351                                         const CollationElementIterator& other)
352{
353    if (this == &other) {
354        return *this;
355    }
356
357    CollationIterator *newIter;
358    const FCDUTF16CollationIterator *otherFCDIter =
359            dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
360    if(otherFCDIter != NULL) {
361        newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
362    } else {
363        const UTF16CollationIterator *otherIter =
364                dynamic_cast<const UTF16CollationIterator *>(other.iter_);
365        if(otherIter != NULL) {
366            newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
367        } else {
368            newIter = NULL;
369        }
370    }
371    if(newIter != NULL) {
372        delete iter_;
373        iter_ = newIter;
374        rbc_ = other.rbc_;
375        otherHalf_ = other.otherHalf_;
376        dir_ = other.dir_;
377
378        string_ = other.string_;
379    }
380    if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
381        UErrorCode errorCode = U_ZERO_ERROR;
382        if(offsets_ == NULL) {
383            offsets_ = new UVector32(other.offsets_->size(), errorCode);
384        }
385        if(offsets_ != NULL) {
386            offsets_->assign(*other.offsets_, errorCode);
387        }
388    }
389    return *this;
390}
391
392namespace {
393
394class MaxExpSink : public ContractionsAndExpansions::CESink {
395public:
396    MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
397    virtual ~MaxExpSink();
398    virtual void handleCE(int64_t /*ce*/) {}
399    virtual void handleExpansion(const int64_t ces[], int32_t length) {
400        if (length <= 1) {
401            // We do not need to add single CEs into the map.
402            return;
403        }
404        int32_t count = 0;  // number of CE "halves"
405        for (int32_t i = 0; i < length; ++i) {
406            count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
407        }
408        // last "half" of the last CE
409        int64_t ce = ces[length - 1];
410        uint32_t p = (uint32_t)(ce >> 32);
411        uint32_t lower32 = (uint32_t)ce;
412        uint32_t lastHalf = getSecondHalf(p, lower32);
413        if (lastHalf == 0) {
414            lastHalf = getFirstHalf(p, lower32);
415            U_ASSERT(lastHalf != 0);
416        } else {
417            lastHalf |= 0xc0;  // old-style continuation CE
418        }
419        if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
420            uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
421        }
422    }
423
424private:
425    UHashtable *maxExpansions;
426    UErrorCode &errorCode;
427};
428
429MaxExpSink::~MaxExpSink() {}
430
431}  // namespace
432
433UHashtable *
434CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
435    if (U_FAILURE(errorCode)) { return NULL; }
436    UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
437                                           uhash_compareLong, &errorCode);
438    if (U_FAILURE(errorCode)) { return NULL; }
439    MaxExpSink sink(maxExpansions, errorCode);
440    ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
441    if (U_FAILURE(errorCode)) {
442        uhash_close(maxExpansions);
443        return NULL;
444    }
445    return maxExpansions;
446}
447
448int32_t
449CollationElementIterator::getMaxExpansion(int32_t order) const {
450    return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
451}
452
453int32_t
454CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
455    if (order == 0) { return 1; }
456    int32_t max;
457    if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
458        return max;
459    }
460    if ((order & 0xc0) == 0xc0) {
461        // old-style continuation CE
462        return 2;
463    } else {
464        return 1;
465    }
466}
467
468U_NAMESPACE_END
469
470#endif /* #if !UCONFIG_NO_COLLATION */
471