1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2016, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* utf8collationiterator.h
9*
10* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
11* created by: Markus W. Scherer
12*/
13
14#ifndef __UTF8COLLATIONITERATOR_H__
15#define __UTF8COLLATIONITERATOR_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "cmemory.h"
22#include "collation.h"
23#include "collationdata.h"
24#include "collationiterator.h"
25#include "normalizer2impl.h"
26
27U_NAMESPACE_BEGIN
28
29/**
30 * UTF-8 collation element and character iterator.
31 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
32 * Unnormalized text is handled by a subclass.
33 */
34class U_I18N_API UTF8CollationIterator : public CollationIterator {
35public:
36    UTF8CollationIterator(const CollationData *d, UBool numeric,
37                          const uint8_t *s, int32_t p, int32_t len)
38            : CollationIterator(d, numeric),
39              u8(s), pos(p), length(len) {}
40
41    virtual ~UTF8CollationIterator();
42
43    virtual void resetToOffset(int32_t newOffset);
44
45    virtual int32_t getOffset() const;
46
47    virtual UChar32 nextCodePoint(UErrorCode &errorCode);
48
49    virtual UChar32 previousCodePoint(UErrorCode &errorCode);
50
51protected:
52    /**
53     * For byte sequences that are illegal in UTF-8, an error value may be returned
54     * together with a bogus code point. The caller will ignore that code point.
55     *
56     * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57     * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
58     *
59     * Valid lead surrogates are returned from inside a normalized text segment,
60     * where handleGetTrailSurrogate() will return the matching trail surrogate.
61     */
62    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
63
64    virtual UBool foundNULTerminator();
65
66    virtual UBool forbidSurrogateCodePoints() const;
67
68    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
69
70    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
71
72    const uint8_t *u8;
73    int32_t pos;
74    int32_t length;  // <0 for NUL-terminated strings
75};
76
77/**
78 * Incrementally checks the input text for FCD and normalizes where necessary.
79 */
80class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
81public:
82    FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
83                             const uint8_t *s, int32_t p, int32_t len)
84            : UTF8CollationIterator(data, numeric, s, p, len),
85              state(CHECK_FWD), start(p),
86              nfcImpl(data->nfcImpl) {}
87
88    virtual ~FCDUTF8CollationIterator();
89
90    virtual void resetToOffset(int32_t newOffset);
91
92    virtual int32_t getOffset() const;
93
94    virtual UChar32 nextCodePoint(UErrorCode &errorCode);
95
96    virtual UChar32 previousCodePoint(UErrorCode &errorCode);
97
98protected:
99    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
100
101    virtual UChar handleGetTrailSurrogate();
102
103    virtual UBool foundNULTerminator();
104
105    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
106
107    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
108
109private:
110    UBool nextHasLccc() const;
111    UBool previousHasTccc() const;
112
113    /**
114     * Switches to forward checking if possible.
115     */
116    void switchToForward();
117
118    /**
119     * Extends the FCD text segment forward or normalizes around pos.
120     * @return TRUE if success
121     */
122    UBool nextSegment(UErrorCode &errorCode);
123
124    /**
125     * Switches to backward checking.
126     */
127    void switchToBackward();
128
129    /**
130     * Extends the FCD text segment backward or normalizes around pos.
131     * @return TRUE if success
132     */
133    UBool previousSegment(UErrorCode &errorCode);
134
135    UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
136
137    enum State {
138        /**
139         * The input text [start..pos[ passes the FCD check.
140         * Moving forward checks incrementally.
141         * limit is undefined.
142         */
143        CHECK_FWD,
144        /**
145         * The input text [pos..limit[ passes the FCD check.
146         * Moving backward checks incrementally.
147         * start is undefined.
148         */
149        CHECK_BWD,
150        /**
151         * The input text [start..limit[ passes the FCD check.
152         * pos tracks the current text index.
153         */
154        IN_FCD_SEGMENT,
155        /**
156         * The input text [start..limit[ failed the FCD check and was normalized.
157         * pos tracks the current index in the normalized string.
158         */
159        IN_NORMALIZED
160    };
161
162    State state;
163
164    int32_t start;
165    int32_t limit;
166
167    const Normalizer2Impl &nfcImpl;
168    UnicodeString normalized;
169};
170
171U_NAMESPACE_END
172
173#endif  // !UCONFIG_NO_COLLATION
174#endif  // __UTF8COLLATIONITERATOR_H__
175