1/*
2*******************************************************************************
3*   Copyright (C) 2010-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucharstrie.h
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010nov14
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/appendable.h"
17#include "unicode/ucharstrie.h"
18#include "unicode/uobject.h"
19#include "unicode/utf16.h"
20#include "cmemory.h"
21#include "uassert.h"
22
23U_NAMESPACE_BEGIN
24
25UCharsTrie::~UCharsTrie() {
26    uprv_free(ownedArray_);
27}
28
29UStringTrieResult
30UCharsTrie::current() const {
31    const UChar *pos=pos_;
32    if(pos==NULL) {
33        return USTRINGTRIE_NO_MATCH;
34    } else {
35        int32_t node;
36        return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
37                valueResult(node) : USTRINGTRIE_NO_VALUE;
38    }
39}
40
41UStringTrieResult
42UCharsTrie::firstForCodePoint(UChar32 cp) {
43    return cp<=0xffff ?
44        first(cp) :
45        (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
46            next(U16_TRAIL(cp)) :
47            USTRINGTRIE_NO_MATCH);
48}
49
50UStringTrieResult
51UCharsTrie::nextForCodePoint(UChar32 cp) {
52    return cp<=0xffff ?
53        next(cp) :
54        (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
55            next(U16_TRAIL(cp)) :
56            USTRINGTRIE_NO_MATCH);
57}
58
59UStringTrieResult
60UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
61    // Branch according to the current unit.
62    if(length==0) {
63        length=*pos++;
64    }
65    ++length;
66    // The length of the branch is the number of units to select from.
67    // The data structure encodes a binary search.
68    while(length>kMaxBranchLinearSubNodeLength) {
69        if(uchar<*pos++) {
70            length>>=1;
71            pos=jumpByDelta(pos);
72        } else {
73            length=length-(length>>1);
74            pos=skipDelta(pos);
75        }
76    }
77    // Drop down to linear search for the last few units.
78    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
79    // and divides length by 2.
80    do {
81        if(uchar==*pos++) {
82            UStringTrieResult result;
83            int32_t node=*pos;
84            if(node&kValueIsFinal) {
85                // Leave the final value for getValue() to read.
86                result=USTRINGTRIE_FINAL_VALUE;
87            } else {
88                // Use the non-final value as the jump delta.
89                ++pos;
90                // int32_t delta=readValue(pos, node);
91                int32_t delta;
92                if(node<kMinTwoUnitValueLead) {
93                    delta=node;
94                } else if(node<kThreeUnitValueLead) {
95                    delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
96                } else {
97                    delta=(pos[0]<<16)|pos[1];
98                    pos+=2;
99                }
100                // end readValue()
101                pos+=delta;
102                node=*pos;
103                result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
104            }
105            pos_=pos;
106            return result;
107        }
108        --length;
109        pos=skipValue(pos);
110    } while(length>1);
111    if(uchar==*pos++) {
112        pos_=pos;
113        int32_t node=*pos;
114        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
115    } else {
116        stop();
117        return USTRINGTRIE_NO_MATCH;
118    }
119}
120
121UStringTrieResult
122UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
123    int32_t node=*pos++;
124    for(;;) {
125        if(node<kMinLinearMatch) {
126            return branchNext(pos, node, uchar);
127        } else if(node<kMinValueLead) {
128            // Match the first of length+1 units.
129            int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
130            if(uchar==*pos++) {
131                remainingMatchLength_=--length;
132                pos_=pos;
133                return (length<0 && (node=*pos)>=kMinValueLead) ?
134                        valueResult(node) : USTRINGTRIE_NO_VALUE;
135            } else {
136                // No match.
137                break;
138            }
139        } else if(node&kValueIsFinal) {
140            // No further matching units.
141            break;
142        } else {
143            // Skip intermediate value.
144            pos=skipNodeValue(pos, node);
145            node&=kNodeTypeMask;
146        }
147    }
148    stop();
149    return USTRINGTRIE_NO_MATCH;
150}
151
152UStringTrieResult
153UCharsTrie::next(int32_t uchar) {
154    const UChar *pos=pos_;
155    if(pos==NULL) {
156        return USTRINGTRIE_NO_MATCH;
157    }
158    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
159    if(length>=0) {
160        // Remaining part of a linear-match node.
161        if(uchar==*pos++) {
162            remainingMatchLength_=--length;
163            pos_=pos;
164            int32_t node;
165            return (length<0 && (node=*pos)>=kMinValueLead) ?
166                    valueResult(node) : USTRINGTRIE_NO_VALUE;
167        } else {
168            stop();
169            return USTRINGTRIE_NO_MATCH;
170        }
171    }
172    return nextImpl(pos, uchar);
173}
174
175UStringTrieResult
176UCharsTrie::next(const UChar *s, int32_t sLength) {
177    if(sLength<0 ? *s==0 : sLength==0) {
178        // Empty input.
179        return current();
180    }
181    const UChar *pos=pos_;
182    if(pos==NULL) {
183        return USTRINGTRIE_NO_MATCH;
184    }
185    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
186    for(;;) {
187        // Fetch the next input unit, if there is one.
188        // Continue a linear-match node without rechecking sLength<0.
189        int32_t uchar;
190        if(sLength<0) {
191            for(;;) {
192                if((uchar=*s++)==0) {
193                    remainingMatchLength_=length;
194                    pos_=pos;
195                    int32_t node;
196                    return (length<0 && (node=*pos)>=kMinValueLead) ?
197                            valueResult(node) : USTRINGTRIE_NO_VALUE;
198                }
199                if(length<0) {
200                    remainingMatchLength_=length;
201                    break;
202                }
203                if(uchar!=*pos) {
204                    stop();
205                    return USTRINGTRIE_NO_MATCH;
206                }
207                ++pos;
208                --length;
209            }
210        } else {
211            for(;;) {
212                if(sLength==0) {
213                    remainingMatchLength_=length;
214                    pos_=pos;
215                    int32_t node;
216                    return (length<0 && (node=*pos)>=kMinValueLead) ?
217                            valueResult(node) : USTRINGTRIE_NO_VALUE;
218                }
219                uchar=*s++;
220                --sLength;
221                if(length<0) {
222                    remainingMatchLength_=length;
223                    break;
224                }
225                if(uchar!=*pos) {
226                    stop();
227                    return USTRINGTRIE_NO_MATCH;
228                }
229                ++pos;
230                --length;
231            }
232        }
233        int32_t node=*pos++;
234        for(;;) {
235            if(node<kMinLinearMatch) {
236                UStringTrieResult result=branchNext(pos, node, uchar);
237                if(result==USTRINGTRIE_NO_MATCH) {
238                    return USTRINGTRIE_NO_MATCH;
239                }
240                // Fetch the next input unit, if there is one.
241                if(sLength<0) {
242                    if((uchar=*s++)==0) {
243                        return result;
244                    }
245                } else {
246                    if(sLength==0) {
247                        return result;
248                    }
249                    uchar=*s++;
250                    --sLength;
251                }
252                if(result==USTRINGTRIE_FINAL_VALUE) {
253                    // No further matching units.
254                    stop();
255                    return USTRINGTRIE_NO_MATCH;
256                }
257                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
258                node=*pos++;
259            } else if(node<kMinValueLead) {
260                // Match length+1 units.
261                length=node-kMinLinearMatch;  // Actual match length minus 1.
262                if(uchar!=*pos) {
263                    stop();
264                    return USTRINGTRIE_NO_MATCH;
265                }
266                ++pos;
267                --length;
268                break;
269            } else if(node&kValueIsFinal) {
270                // No further matching units.
271                stop();
272                return USTRINGTRIE_NO_MATCH;
273            } else {
274                // Skip intermediate value.
275                pos=skipNodeValue(pos, node);
276                node&=kNodeTypeMask;
277            }
278        }
279    }
280}
281
282const UChar *
283UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
284                                      UBool haveUniqueValue, int32_t &uniqueValue) {
285    while(length>kMaxBranchLinearSubNodeLength) {
286        ++pos;  // ignore the comparison unit
287        if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
288            return NULL;
289        }
290        length=length-(length>>1);
291        pos=skipDelta(pos);
292    }
293    do {
294        ++pos;  // ignore a comparison unit
295        // handle its value
296        int32_t node=*pos++;
297        UBool isFinal=(UBool)(node>>15);
298        node&=0x7fff;
299        int32_t value=readValue(pos, node);
300        pos=skipValue(pos, node);
301        if(isFinal) {
302            if(haveUniqueValue) {
303                if(value!=uniqueValue) {
304                    return NULL;
305                }
306            } else {
307                uniqueValue=value;
308                haveUniqueValue=TRUE;
309            }
310        } else {
311            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
312                return NULL;
313            }
314            haveUniqueValue=TRUE;
315        }
316    } while(--length>1);
317    return pos+1;  // ignore the last comparison unit
318}
319
320UBool
321UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
322    int32_t node=*pos++;
323    for(;;) {
324        if(node<kMinLinearMatch) {
325            if(node==0) {
326                node=*pos++;
327            }
328            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
329            if(pos==NULL) {
330                return FALSE;
331            }
332            haveUniqueValue=TRUE;
333            node=*pos++;
334        } else if(node<kMinValueLead) {
335            // linear-match node
336            pos+=node-kMinLinearMatch+1;  // Ignore the match units.
337            node=*pos++;
338        } else {
339            UBool isFinal=(UBool)(node>>15);
340            int32_t value;
341            if(isFinal) {
342                value=readValue(pos, node&0x7fff);
343            } else {
344                value=readNodeValue(pos, node);
345            }
346            if(haveUniqueValue) {
347                if(value!=uniqueValue) {
348                    return FALSE;
349                }
350            } else {
351                uniqueValue=value;
352                haveUniqueValue=TRUE;
353            }
354            if(isFinal) {
355                return TRUE;
356            }
357            pos=skipNodeValue(pos, node);
358            node&=kNodeTypeMask;
359        }
360    }
361}
362
363int32_t
364UCharsTrie::getNextUChars(Appendable &out) const {
365    const UChar *pos=pos_;
366    if(pos==NULL) {
367        return 0;
368    }
369    if(remainingMatchLength_>=0) {
370        out.appendCodeUnit(*pos);  // Next unit of a pending linear-match node.
371        return 1;
372    }
373    int32_t node=*pos++;
374    if(node>=kMinValueLead) {
375        if(node&kValueIsFinal) {
376            return 0;
377        } else {
378            pos=skipNodeValue(pos, node);
379            node&=kNodeTypeMask;
380        }
381    }
382    if(node<kMinLinearMatch) {
383        if(node==0) {
384            node=*pos++;
385        }
386        out.reserveAppendCapacity(++node);
387        getNextBranchUChars(pos, node, out);
388        return node;
389    } else {
390        // First unit of the linear-match node.
391        out.appendCodeUnit(*pos);
392        return 1;
393    }
394}
395
396void
397UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
398    while(length>kMaxBranchLinearSubNodeLength) {
399        ++pos;  // ignore the comparison unit
400        getNextBranchUChars(jumpByDelta(pos), length>>1, out);
401        length=length-(length>>1);
402        pos=skipDelta(pos);
403    }
404    do {
405        out.appendCodeUnit(*pos++);
406        pos=skipValue(pos);
407    } while(--length>1);
408    out.appendCodeUnit(*pos);
409}
410
411U_NAMESPACE_END
412