1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2010-2011, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  ucharstrie.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2010nov14
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/appendable.h"
19#include "unicode/ucharstrie.h"
20#include "unicode/uobject.h"
21#include "unicode/utf16.h"
22#include "cmemory.h"
23#include "uassert.h"
24
25U_NAMESPACE_BEGIN
26
27UCharsTrie::~UCharsTrie() {
28    uprv_free(ownedArray_);
29}
30
31UStringTrieResult
32UCharsTrie::current() const {
33    const UChar *pos=pos_;
34    if(pos==NULL) {
35        return USTRINGTRIE_NO_MATCH;
36    } else {
37        int32_t node;
38        return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
39                valueResult(node) : USTRINGTRIE_NO_VALUE;
40    }
41}
42
43UStringTrieResult
44UCharsTrie::firstForCodePoint(UChar32 cp) {
45    return cp<=0xffff ?
46        first(cp) :
47        (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
48            next(U16_TRAIL(cp)) :
49            USTRINGTRIE_NO_MATCH);
50}
51
52UStringTrieResult
53UCharsTrie::nextForCodePoint(UChar32 cp) {
54    return cp<=0xffff ?
55        next(cp) :
56        (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
57            next(U16_TRAIL(cp)) :
58            USTRINGTRIE_NO_MATCH);
59}
60
61UStringTrieResult
62UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
63    // Branch according to the current unit.
64    if(length==0) {
65        length=*pos++;
66    }
67    ++length;
68    // The length of the branch is the number of units to select from.
69    // The data structure encodes a binary search.
70    while(length>kMaxBranchLinearSubNodeLength) {
71        if(uchar<*pos++) {
72            length>>=1;
73            pos=jumpByDelta(pos);
74        } else {
75            length=length-(length>>1);
76            pos=skipDelta(pos);
77        }
78    }
79    // Drop down to linear search for the last few units.
80    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
81    // and divides length by 2.
82    do {
83        if(uchar==*pos++) {
84            UStringTrieResult result;
85            int32_t node=*pos;
86            if(node&kValueIsFinal) {
87                // Leave the final value for getValue() to read.
88                result=USTRINGTRIE_FINAL_VALUE;
89            } else {
90                // Use the non-final value as the jump delta.
91                ++pos;
92                // int32_t delta=readValue(pos, node);
93                int32_t delta;
94                if(node<kMinTwoUnitValueLead) {
95                    delta=node;
96                } else if(node<kThreeUnitValueLead) {
97                    delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
98                } else {
99                    delta=(pos[0]<<16)|pos[1];
100                    pos+=2;
101                }
102                // end readValue()
103                pos+=delta;
104                node=*pos;
105                result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
106            }
107            pos_=pos;
108            return result;
109        }
110        --length;
111        pos=skipValue(pos);
112    } while(length>1);
113    if(uchar==*pos++) {
114        pos_=pos;
115        int32_t node=*pos;
116        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
117    } else {
118        stop();
119        return USTRINGTRIE_NO_MATCH;
120    }
121}
122
123UStringTrieResult
124UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
125    int32_t node=*pos++;
126    for(;;) {
127        if(node<kMinLinearMatch) {
128            return branchNext(pos, node, uchar);
129        } else if(node<kMinValueLead) {
130            // Match the first of length+1 units.
131            int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
132            if(uchar==*pos++) {
133                remainingMatchLength_=--length;
134                pos_=pos;
135                return (length<0 && (node=*pos)>=kMinValueLead) ?
136                        valueResult(node) : USTRINGTRIE_NO_VALUE;
137            } else {
138                // No match.
139                break;
140            }
141        } else if(node&kValueIsFinal) {
142            // No further matching units.
143            break;
144        } else {
145            // Skip intermediate value.
146            pos=skipNodeValue(pos, node);
147            node&=kNodeTypeMask;
148        }
149    }
150    stop();
151    return USTRINGTRIE_NO_MATCH;
152}
153
154UStringTrieResult
155UCharsTrie::next(int32_t uchar) {
156    const UChar *pos=pos_;
157    if(pos==NULL) {
158        return USTRINGTRIE_NO_MATCH;
159    }
160    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
161    if(length>=0) {
162        // Remaining part of a linear-match node.
163        if(uchar==*pos++) {
164            remainingMatchLength_=--length;
165            pos_=pos;
166            int32_t node;
167            return (length<0 && (node=*pos)>=kMinValueLead) ?
168                    valueResult(node) : USTRINGTRIE_NO_VALUE;
169        } else {
170            stop();
171            return USTRINGTRIE_NO_MATCH;
172        }
173    }
174    return nextImpl(pos, uchar);
175}
176
177UStringTrieResult
178UCharsTrie::next(const UChar *s, int32_t sLength) {
179    if(sLength<0 ? *s==0 : sLength==0) {
180        // Empty input.
181        return current();
182    }
183    const UChar *pos=pos_;
184    if(pos==NULL) {
185        return USTRINGTRIE_NO_MATCH;
186    }
187    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
188    for(;;) {
189        // Fetch the next input unit, if there is one.
190        // Continue a linear-match node without rechecking sLength<0.
191        int32_t uchar;
192        if(sLength<0) {
193            for(;;) {
194                if((uchar=*s++)==0) {
195                    remainingMatchLength_=length;
196                    pos_=pos;
197                    int32_t node;
198                    return (length<0 && (node=*pos)>=kMinValueLead) ?
199                            valueResult(node) : USTRINGTRIE_NO_VALUE;
200                }
201                if(length<0) {
202                    remainingMatchLength_=length;
203                    break;
204                }
205                if(uchar!=*pos) {
206                    stop();
207                    return USTRINGTRIE_NO_MATCH;
208                }
209                ++pos;
210                --length;
211            }
212        } else {
213            for(;;) {
214                if(sLength==0) {
215                    remainingMatchLength_=length;
216                    pos_=pos;
217                    int32_t node;
218                    return (length<0 && (node=*pos)>=kMinValueLead) ?
219                            valueResult(node) : USTRINGTRIE_NO_VALUE;
220                }
221                uchar=*s++;
222                --sLength;
223                if(length<0) {
224                    remainingMatchLength_=length;
225                    break;
226                }
227                if(uchar!=*pos) {
228                    stop();
229                    return USTRINGTRIE_NO_MATCH;
230                }
231                ++pos;
232                --length;
233            }
234        }
235        int32_t node=*pos++;
236        for(;;) {
237            if(node<kMinLinearMatch) {
238                UStringTrieResult result=branchNext(pos, node, uchar);
239                if(result==USTRINGTRIE_NO_MATCH) {
240                    return USTRINGTRIE_NO_MATCH;
241                }
242                // Fetch the next input unit, if there is one.
243                if(sLength<0) {
244                    if((uchar=*s++)==0) {
245                        return result;
246                    }
247                } else {
248                    if(sLength==0) {
249                        return result;
250                    }
251                    uchar=*s++;
252                    --sLength;
253                }
254                if(result==USTRINGTRIE_FINAL_VALUE) {
255                    // No further matching units.
256                    stop();
257                    return USTRINGTRIE_NO_MATCH;
258                }
259                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
260                node=*pos++;
261            } else if(node<kMinValueLead) {
262                // Match length+1 units.
263                length=node-kMinLinearMatch;  // Actual match length minus 1.
264                if(uchar!=*pos) {
265                    stop();
266                    return USTRINGTRIE_NO_MATCH;
267                }
268                ++pos;
269                --length;
270                break;
271            } else if(node&kValueIsFinal) {
272                // No further matching units.
273                stop();
274                return USTRINGTRIE_NO_MATCH;
275            } else {
276                // Skip intermediate value.
277                pos=skipNodeValue(pos, node);
278                node&=kNodeTypeMask;
279            }
280        }
281    }
282}
283
284const UChar *
285UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
286                                      UBool haveUniqueValue, int32_t &uniqueValue) {
287    while(length>kMaxBranchLinearSubNodeLength) {
288        ++pos;  // ignore the comparison unit
289        if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
290            return NULL;
291        }
292        length=length-(length>>1);
293        pos=skipDelta(pos);
294    }
295    do {
296        ++pos;  // ignore a comparison unit
297        // handle its value
298        int32_t node=*pos++;
299        UBool isFinal=(UBool)(node>>15);
300        node&=0x7fff;
301        int32_t value=readValue(pos, node);
302        pos=skipValue(pos, node);
303        if(isFinal) {
304            if(haveUniqueValue) {
305                if(value!=uniqueValue) {
306                    return NULL;
307                }
308            } else {
309                uniqueValue=value;
310                haveUniqueValue=TRUE;
311            }
312        } else {
313            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
314                return NULL;
315            }
316            haveUniqueValue=TRUE;
317        }
318    } while(--length>1);
319    return pos+1;  // ignore the last comparison unit
320}
321
322UBool
323UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
324    int32_t node=*pos++;
325    for(;;) {
326        if(node<kMinLinearMatch) {
327            if(node==0) {
328                node=*pos++;
329            }
330            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
331            if(pos==NULL) {
332                return FALSE;
333            }
334            haveUniqueValue=TRUE;
335            node=*pos++;
336        } else if(node<kMinValueLead) {
337            // linear-match node
338            pos+=node-kMinLinearMatch+1;  // Ignore the match units.
339            node=*pos++;
340        } else {
341            UBool isFinal=(UBool)(node>>15);
342            int32_t value;
343            if(isFinal) {
344                value=readValue(pos, node&0x7fff);
345            } else {
346                value=readNodeValue(pos, node);
347            }
348            if(haveUniqueValue) {
349                if(value!=uniqueValue) {
350                    return FALSE;
351                }
352            } else {
353                uniqueValue=value;
354                haveUniqueValue=TRUE;
355            }
356            if(isFinal) {
357                return TRUE;
358            }
359            pos=skipNodeValue(pos, node);
360            node&=kNodeTypeMask;
361        }
362    }
363}
364
365int32_t
366UCharsTrie::getNextUChars(Appendable &out) const {
367    const UChar *pos=pos_;
368    if(pos==NULL) {
369        return 0;
370    }
371    if(remainingMatchLength_>=0) {
372        out.appendCodeUnit(*pos);  // Next unit of a pending linear-match node.
373        return 1;
374    }
375    int32_t node=*pos++;
376    if(node>=kMinValueLead) {
377        if(node&kValueIsFinal) {
378            return 0;
379        } else {
380            pos=skipNodeValue(pos, node);
381            node&=kNodeTypeMask;
382        }
383    }
384    if(node<kMinLinearMatch) {
385        if(node==0) {
386            node=*pos++;
387        }
388        out.reserveAppendCapacity(++node);
389        getNextBranchUChars(pos, node, out);
390        return node;
391    } else {
392        // First unit of the linear-match node.
393        out.appendCodeUnit(*pos);
394        return 1;
395    }
396}
397
398void
399UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
400    while(length>kMaxBranchLinearSubNodeLength) {
401        ++pos;  // ignore the comparison unit
402        getNextBranchUChars(jumpByDelta(pos), length>>1, out);
403        length=length-(length>>1);
404        pos=skipDelta(pos);
405    }
406    do {
407        out.appendCodeUnit(*pos++);
408        pos=skipValue(pos);
409    } while(--length>1);
410    out.appendCodeUnit(*pos);
411}
412
413U_NAMESPACE_END
414