1/*
2*******************************************************************************
3*   Copyright (C) 2010-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  bytestrie.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010sep25
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/bytestream.h"
17#include "unicode/bytestrie.h"
18#include "unicode/uobject.h"
19#include "cmemory.h"
20#include "uassert.h"
21
22U_NAMESPACE_BEGIN
23
24BytesTrie::~BytesTrie() {
25    uprv_free(ownedArray_);
26}
27
28// lead byte already shifted right by 1.
29int32_t
30BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
31    int32_t value;
32    if(leadByte<kMinTwoByteValueLead) {
33        value=leadByte-kMinOneByteValueLead;
34    } else if(leadByte<kMinThreeByteValueLead) {
35        value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
36    } else if(leadByte<kFourByteValueLead) {
37        value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
38    } else if(leadByte==kFourByteValueLead) {
39        value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
40    } else {
41        value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
42    }
43    return value;
44}
45
46const uint8_t *
47BytesTrie::jumpByDelta(const uint8_t *pos) {
48    int32_t delta=*pos++;
49    if(delta<kMinTwoByteDeltaLead) {
50        // nothing to do
51    } else if(delta<kMinThreeByteDeltaLead) {
52        delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
53    } else if(delta<kFourByteDeltaLead) {
54        delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
55        pos+=2;
56    } else if(delta==kFourByteDeltaLead) {
57        delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
58        pos+=3;
59    } else {
60        delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
61        pos+=4;
62    }
63    return pos+delta;
64}
65
66UStringTrieResult
67BytesTrie::current() const {
68    const uint8_t *pos=pos_;
69    if(pos==NULL) {
70        return USTRINGTRIE_NO_MATCH;
71    } else {
72        int32_t node;
73        return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
74                valueResult(node) : USTRINGTRIE_NO_VALUE;
75    }
76}
77
78UStringTrieResult
79BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
80    // Branch according to the current byte.
81    if(length==0) {
82        length=*pos++;
83    }
84    ++length;
85    // The length of the branch is the number of bytes to select from.
86    // The data structure encodes a binary search.
87    while(length>kMaxBranchLinearSubNodeLength) {
88        if(inByte<*pos++) {
89            length>>=1;
90            pos=jumpByDelta(pos);
91        } else {
92            length=length-(length>>1);
93            pos=skipDelta(pos);
94        }
95    }
96    // Drop down to linear search for the last few bytes.
97    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
98    // and divides length by 2.
99    do {
100        if(inByte==*pos++) {
101            UStringTrieResult result;
102            int32_t node=*pos;
103            U_ASSERT(node>=kMinValueLead);
104            if(node&kValueIsFinal) {
105                // Leave the final value for getValue() to read.
106                result=USTRINGTRIE_FINAL_VALUE;
107            } else {
108                // Use the non-final value as the jump delta.
109                ++pos;
110                // int32_t delta=readValue(pos, node>>1);
111                node>>=1;
112                int32_t delta;
113                if(node<kMinTwoByteValueLead) {
114                    delta=node-kMinOneByteValueLead;
115                } else if(node<kMinThreeByteValueLead) {
116                    delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
117                } else if(node<kFourByteValueLead) {
118                    delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
119                    pos+=2;
120                } else if(node==kFourByteValueLead) {
121                    delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
122                    pos+=3;
123                } else {
124                    delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
125                    pos+=4;
126                }
127                // end readValue()
128                pos+=delta;
129                node=*pos;
130                result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
131            }
132            pos_=pos;
133            return result;
134        }
135        --length;
136        pos=skipValue(pos);
137    } while(length>1);
138    if(inByte==*pos++) {
139        pos_=pos;
140        int32_t node=*pos;
141        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
142    } else {
143        stop();
144        return USTRINGTRIE_NO_MATCH;
145    }
146}
147
148UStringTrieResult
149BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
150    for(;;) {
151        int32_t node=*pos++;
152        if(node<kMinLinearMatch) {
153            return branchNext(pos, node, inByte);
154        } else if(node<kMinValueLead) {
155            // Match the first of length+1 bytes.
156            int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
157            if(inByte==*pos++) {
158                remainingMatchLength_=--length;
159                pos_=pos;
160                return (length<0 && (node=*pos)>=kMinValueLead) ?
161                        valueResult(node) : USTRINGTRIE_NO_VALUE;
162            } else {
163                // No match.
164                break;
165            }
166        } else if(node&kValueIsFinal) {
167            // No further matching bytes.
168            break;
169        } else {
170            // Skip intermediate value.
171            pos=skipValue(pos, node);
172            // The next node must not also be a value node.
173            U_ASSERT(*pos<kMinValueLead);
174        }
175    }
176    stop();
177    return USTRINGTRIE_NO_MATCH;
178}
179
180UStringTrieResult
181BytesTrie::next(int32_t inByte) {
182    const uint8_t *pos=pos_;
183    if(pos==NULL) {
184        return USTRINGTRIE_NO_MATCH;
185    }
186    if(inByte<0) {
187        inByte+=0x100;
188    }
189    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
190    if(length>=0) {
191        // Remaining part of a linear-match node.
192        if(inByte==*pos++) {
193            remainingMatchLength_=--length;
194            pos_=pos;
195            int32_t node;
196            return (length<0 && (node=*pos)>=kMinValueLead) ?
197                    valueResult(node) : USTRINGTRIE_NO_VALUE;
198        } else {
199            stop();
200            return USTRINGTRIE_NO_MATCH;
201        }
202    }
203    return nextImpl(pos, inByte);
204}
205
206UStringTrieResult
207BytesTrie::next(const char *s, int32_t sLength) {
208    if(sLength<0 ? *s==0 : sLength==0) {
209        // Empty input.
210        return current();
211    }
212    const uint8_t *pos=pos_;
213    if(pos==NULL) {
214        return USTRINGTRIE_NO_MATCH;
215    }
216    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
217    for(;;) {
218        // Fetch the next input byte, if there is one.
219        // Continue a linear-match node without rechecking sLength<0.
220        int32_t inByte;
221        if(sLength<0) {
222            for(;;) {
223                if((inByte=*s++)==0) {
224                    remainingMatchLength_=length;
225                    pos_=pos;
226                    int32_t node;
227                    return (length<0 && (node=*pos)>=kMinValueLead) ?
228                            valueResult(node) : USTRINGTRIE_NO_VALUE;
229                }
230                if(length<0) {
231                    remainingMatchLength_=length;
232                    break;
233                }
234                if(inByte!=*pos) {
235                    stop();
236                    return USTRINGTRIE_NO_MATCH;
237                }
238                ++pos;
239                --length;
240            }
241        } else {
242            for(;;) {
243                if(sLength==0) {
244                    remainingMatchLength_=length;
245                    pos_=pos;
246                    int32_t node;
247                    return (length<0 && (node=*pos)>=kMinValueLead) ?
248                            valueResult(node) : USTRINGTRIE_NO_VALUE;
249                }
250                inByte=*s++;
251                --sLength;
252                if(length<0) {
253                    remainingMatchLength_=length;
254                    break;
255                }
256                if(inByte!=*pos) {
257                    stop();
258                    return USTRINGTRIE_NO_MATCH;
259                }
260                ++pos;
261                --length;
262            }
263        }
264        for(;;) {
265            int32_t node=*pos++;
266            if(node<kMinLinearMatch) {
267                UStringTrieResult result=branchNext(pos, node, inByte);
268                if(result==USTRINGTRIE_NO_MATCH) {
269                    return USTRINGTRIE_NO_MATCH;
270                }
271                // Fetch the next input byte, if there is one.
272                if(sLength<0) {
273                    if((inByte=*s++)==0) {
274                        return result;
275                    }
276                } else {
277                    if(sLength==0) {
278                        return result;
279                    }
280                    inByte=*s++;
281                    --sLength;
282                }
283                if(result==USTRINGTRIE_FINAL_VALUE) {
284                    // No further matching bytes.
285                    stop();
286                    return USTRINGTRIE_NO_MATCH;
287                }
288                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
289            } else if(node<kMinValueLead) {
290                // Match length+1 bytes.
291                length=node-kMinLinearMatch;  // Actual match length minus 1.
292                if(inByte!=*pos) {
293                    stop();
294                    return USTRINGTRIE_NO_MATCH;
295                }
296                ++pos;
297                --length;
298                break;
299            } else if(node&kValueIsFinal) {
300                // No further matching bytes.
301                stop();
302                return USTRINGTRIE_NO_MATCH;
303            } else {
304                // Skip intermediate value.
305                pos=skipValue(pos, node);
306                // The next node must not also be a value node.
307                U_ASSERT(*pos<kMinValueLead);
308            }
309        }
310    }
311}
312
313const uint8_t *
314BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
315                                     UBool haveUniqueValue, int32_t &uniqueValue) {
316    while(length>kMaxBranchLinearSubNodeLength) {
317        ++pos;  // ignore the comparison byte
318        if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
319            return NULL;
320        }
321        length=length-(length>>1);
322        pos=skipDelta(pos);
323    }
324    do {
325        ++pos;  // ignore a comparison byte
326        // handle its value
327        int32_t node=*pos++;
328        UBool isFinal=(UBool)(node&kValueIsFinal);
329        int32_t value=readValue(pos, node>>1);
330        pos=skipValue(pos, node);
331        if(isFinal) {
332            if(haveUniqueValue) {
333                if(value!=uniqueValue) {
334                    return NULL;
335                }
336            } else {
337                uniqueValue=value;
338                haveUniqueValue=TRUE;
339            }
340        } else {
341            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
342                return NULL;
343            }
344            haveUniqueValue=TRUE;
345        }
346    } while(--length>1);
347    return pos+1;  // ignore the last comparison byte
348}
349
350UBool
351BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
352    for(;;) {
353        int32_t node=*pos++;
354        if(node<kMinLinearMatch) {
355            if(node==0) {
356                node=*pos++;
357            }
358            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
359            if(pos==NULL) {
360                return FALSE;
361            }
362            haveUniqueValue=TRUE;
363        } else if(node<kMinValueLead) {
364            // linear-match node
365            pos+=node-kMinLinearMatch+1;  // Ignore the match bytes.
366        } else {
367            UBool isFinal=(UBool)(node&kValueIsFinal);
368            int32_t value=readValue(pos, node>>1);
369            if(haveUniqueValue) {
370                if(value!=uniqueValue) {
371                    return FALSE;
372                }
373            } else {
374                uniqueValue=value;
375                haveUniqueValue=TRUE;
376            }
377            if(isFinal) {
378                return TRUE;
379            }
380            pos=skipValue(pos, node);
381        }
382    }
383}
384
385int32_t
386BytesTrie::getNextBytes(ByteSink &out) const {
387    const uint8_t *pos=pos_;
388    if(pos==NULL) {
389        return 0;
390    }
391    if(remainingMatchLength_>=0) {
392        append(out, *pos);  // Next byte of a pending linear-match node.
393        return 1;
394    }
395    int32_t node=*pos++;
396    if(node>=kMinValueLead) {
397        if(node&kValueIsFinal) {
398            return 0;
399        } else {
400            pos=skipValue(pos, node);
401            node=*pos++;
402            U_ASSERT(node<kMinValueLead);
403        }
404    }
405    if(node<kMinLinearMatch) {
406        if(node==0) {
407            node=*pos++;
408        }
409        getNextBranchBytes(pos, ++node, out);
410        return node;
411    } else {
412        // First byte of the linear-match node.
413        append(out, *pos);
414        return 1;
415    }
416}
417
418void
419BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
420    while(length>kMaxBranchLinearSubNodeLength) {
421        ++pos;  // ignore the comparison byte
422        getNextBranchBytes(jumpByDelta(pos), length>>1, out);
423        length=length-(length>>1);
424        pos=skipDelta(pos);
425    }
426    do {
427        append(out, *pos++);
428        pos=skipValue(pos);
429    } while(--length>1);
430    append(out, *pos);
431}
432
433void
434BytesTrie::append(ByteSink &out, int c) {
435    char ch=(char)c;
436    out.Append(&ch, 1);
437}
438
439U_NAMESPACE_END
440