1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2009, International Business Machines Corporation and         *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.impl;
10
11/**
12 * @author aheninger
13 *
14 * A Trie2Writable is a modifiable, or build-time Trie2.
15 * Functions for reading data from the Trie are all from class Trie2.
16 *
17 */
18public class Trie2Writable extends Trie2 {
19
20
21    /**
22     * Create a new, empty, writable Trie2. 32-bit data values are used.
23     *
24     * @param initialValueP the initial value that is set for all code points
25     * @param errorValueP the value for out-of-range code points and illegal UTF-8
26     */
27    public  Trie2Writable(int initialValueP, int errorValueP) {
28        // This constructor corresponds to utrie2_open() in ICU4C.
29        init(initialValueP, errorValueP);
30    }
31
32
33    private void init(int initialValueP, int errorValueP) {
34        this.initialValue = initialValueP;
35        this.errorValue   = errorValueP;
36        this.highStart    = 0x110000;
37
38        this.data           = new int[UNEWTRIE2_INITIAL_DATA_LENGTH];
39        this.dataCapacity   = UNEWTRIE2_INITIAL_DATA_LENGTH;
40        this.initialValue   = initialValueP;
41        this.errorValue     = errorValueP;
42        this.highStart      = 0x110000;
43        this.firstFreeBlock = 0;  /* no free block in the list */
44        this.isCompacted    = false;
45
46        /*
47         * preallocate and reset
48         * - ASCII
49         * - the bad-UTF-8-data block
50         * - the null data block
51         */
52        int i, j;
53        for(i=0; i<0x80; ++i) {
54            data[i] = initialValue;
55        }
56        for(; i<0xc0; ++i) {
57            data[i] = errorValue;
58        }
59        for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
60            data[i] = initialValue;
61        }
62        dataNullOffset = UNEWTRIE2_DATA_NULL_OFFSET;
63        dataLength     = UNEWTRIE2_DATA_START_OFFSET;
64
65        /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
66        for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
67            index2[i]=j;
68            map[i]=1;
69        }
70
71        /* reference counts for the bad-UTF-8-data block */
72        for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
73            map[i]=0;
74        }
75
76        /*
77         * Reference counts for the null data block: all blocks except for the ASCII blocks.
78         * Plus 1 so that we don't drop this block during compaction.
79         * Plus as many as needed for lead surrogate code points.
80         */
81        /* i==newTrie->dataNullOffset */
82        map[i++] =
83            (0x110000>>UTRIE2_SHIFT_2) -
84            (0x80>>UTRIE2_SHIFT_2) +
85            1 +
86            UTRIE2_LSCP_INDEX_2_LENGTH;
87        j += UTRIE2_DATA_BLOCK_LENGTH;
88        for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
89            map[i]=0;
90        }
91
92        /*
93         * set the remaining indexes in the BMP index-2 block
94         * to the null data block
95         */
96        for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
97            index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
98        }
99
100        /*
101         * Fill the index gap with impossible values so that compaction
102         * does not overlap other index-2 blocks with the gap.
103         */
104        for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
105            index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
106        }
107
108        /* set the indexes in the null index-2 block */
109        for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
110            index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
111        }
112        index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
113        index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
114
115        /* set the index-1 indexes for the linear index-2 block */
116        for(i=0, j=0;
117            i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
118            ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
119        ) {
120            index1[i]=j;
121        }
122
123        /* set the remaining index-1 indexes to the null index-2 block */
124        for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
125            index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
126        }
127
128        /*
129         * Preallocate and reset data for U+0080..U+07ff,
130         * for 2-byte UTF-8 which will be compacted in 64-blocks
131         * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
132         */
133        for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
134            set(i, initialValue);
135        }
136
137    }
138
139
140    /**
141     * Create a new build time (modifiable) Trie2 whose contents are the same as the source Trie2.
142     *
143     * @param source the source Trie2.  Its contents will be copied into the new Trie2.
144     */
145    public Trie2Writable(Trie2 source) {
146        init(source.initialValue, source.errorValue);
147
148        for (Range r: source) {
149            setRange(r, true);
150        }
151    }
152
153
154    private boolean isInNullBlock(int c, boolean forLSCP) {
155        int i2, block;
156
157        if(Character.isHighSurrogate((char)c) && forLSCP) {
158            i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
159                (c>>UTRIE2_SHIFT_2);
160        } else {
161            i2=index1[c>>UTRIE2_SHIFT_1]+
162                ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
163        }
164        block=index2[i2];
165        return (block==dataNullOffset);
166    }
167
168    private int allocIndex2Block() {
169        int newBlock, newTop;
170
171        newBlock=index2Length;
172        newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
173        if(newTop > index2.length) {
174            throw new IllegalStateException("Internal error in Trie2 creation.");
175            /*
176             * Should never occur.
177             * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
178             * or the code writes more values than should be possible.
179             */
180        }
181        index2Length=newTop;
182        System.arraycopy(index2, index2NullOffset, index2, newBlock, UTRIE2_INDEX_2_BLOCK_LENGTH);
183        return newBlock;
184    }
185
186    private int getIndex2Block(int c, boolean forLSCP) {
187        int i1, i2;
188
189        if(c>=0xd800 && c<0xdc00 && forLSCP) {
190            return UTRIE2_LSCP_INDEX_2_OFFSET;
191        }
192
193        i1=c>>UTRIE2_SHIFT_1;
194        i2=index1[i1];
195        if(i2==index2NullOffset) {
196            i2=allocIndex2Block();
197            index1[i1]=i2;
198        }
199        return i2;
200    }
201
202    private int allocDataBlock(int copyBlock) {
203        int newBlock, newTop;
204
205        if(firstFreeBlock!=0) {
206            /* get the first free block */
207            newBlock=firstFreeBlock;
208            firstFreeBlock=-map[newBlock>>UTRIE2_SHIFT_2];
209        } else {
210            /* get a new block from the high end */
211            newBlock=dataLength;
212            newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
213            if(newTop>dataCapacity) {
214                /* out of memory in the data array */
215                int capacity;
216                int[] newData;
217
218                if(dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
219                    capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
220                } else if(dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
221                    capacity=UNEWTRIE2_MAX_DATA_LENGTH;
222                } else {
223                    /*
224                     * Should never occur.
225                     * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
226                     * or the code writes more values than should be possible.
227                     */
228                    throw new IllegalStateException("Internal error in Trie2 creation.");
229                }
230                newData = new int[capacity];
231                System.arraycopy(data, 0, newData, 0, dataLength);
232                data=newData;
233                dataCapacity=capacity;
234            }
235            dataLength=newTop;
236        }
237        System.arraycopy(data, copyBlock, data, newBlock, UTRIE2_DATA_BLOCK_LENGTH);
238        map[newBlock>>UTRIE2_SHIFT_2]=0;
239        return newBlock;
240    }
241
242
243    /* call when the block's reference counter reaches 0 */
244    private void releaseDataBlock(int block) {
245        /* put this block at the front of the free-block chain */
246        map[block>>UTRIE2_SHIFT_2]=-firstFreeBlock;
247        firstFreeBlock=block;
248    }
249
250
251    private boolean isWritableBlock(int block) {
252        return (block!=dataNullOffset && 1==map[block>>UTRIE2_SHIFT_2]);
253    }
254
255    private void setIndex2Entry(int i2, int block) {
256        int oldBlock;
257        ++map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
258        oldBlock=index2[i2];
259        if(0 == --map[oldBlock>>UTRIE2_SHIFT_2]) {
260            releaseDataBlock(oldBlock);
261        }
262        index2[i2]=block;
263    }
264
265
266    /**
267     * No error checking for illegal arguments.
268     *
269     * @internal
270     */
271    private int getDataBlock(int c, boolean forLSCP) {
272        int i2, oldBlock, newBlock;
273
274        i2=getIndex2Block(c, forLSCP);
275
276        i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
277        oldBlock=index2[i2];
278        if(isWritableBlock(oldBlock)) {
279            return oldBlock;
280        }
281
282        /* allocate a new data block */
283        newBlock=allocDataBlock(oldBlock);
284        setIndex2Entry(i2, newBlock);
285        return newBlock;
286    }
287    /**
288     * Set a value for a code point.
289     *
290     * @param c the code point
291     * @param value the value
292     */
293    public Trie2Writable set(int c, int value) {
294        if (c<0 || c>0x10ffff) {
295            throw new IllegalArgumentException("Invalid code point.");
296        }
297        set(c, true, value);
298        fHash = 0;
299        return this;
300    }
301
302    private Trie2Writable set(int c, boolean forLSCP, int value) {
303        int block;
304        if (isCompacted) {
305            uncompact();
306        }
307        block = getDataBlock(c, forLSCP);
308        data[block + (c&UTRIE2_DATA_MASK)] = value;
309        return this;
310    }
311
312
313    /*
314     * Uncompact a compacted Trie2Writable.
315     * This is needed if a the WritableTrie2 was compacted in preparation for creating a read-only
316     * Trie2, and then is subsequently altered.
317     *
318     * The structure is a bit awkward - it would be cleaner to leave the original
319     * Trie2 unaltered - but compacting in place was taken directly from the ICU4C code.
320     *
321     * The approach is to create a new (uncompacted) Trie2Writable from this one, then transfer
322     * the guts from the new to the old.
323     */
324    private void uncompact() {
325        Trie2Writable tempTrie = new Trie2Writable(this);
326
327        // Members from Trie2Writable
328        this.index1       = tempTrie.index1;
329        this.index2       = tempTrie.index2;
330        this.data         = tempTrie.data;
331        this.index2Length = tempTrie.index2Length;
332        this.dataCapacity = tempTrie.dataCapacity;
333        this.isCompacted  = tempTrie.isCompacted;
334
335        // Members From Trie2
336        this.header           = tempTrie.header;
337        this.index            = tempTrie.index;
338        this.data16           = tempTrie.data16;
339        this.data32           = tempTrie.data32;
340        this.indexLength      = tempTrie.indexLength;
341        this.dataLength       = tempTrie.dataLength;
342        this.index2NullOffset = tempTrie.index2NullOffset;
343        this.initialValue     = tempTrie.initialValue;
344        this.errorValue       = tempTrie.errorValue;
345        this.highStart        = tempTrie.highStart;
346        this.highValueIndex   = tempTrie.highValueIndex;
347        this.dataNullOffset   = tempTrie.dataNullOffset;
348    }
349
350
351    private void writeBlock(int  block, int value) {
352        int  limit=block+UTRIE2_DATA_BLOCK_LENGTH;
353        while(block<limit) {
354            data[block++]=value;
355        }
356    }
357
358    /**
359     * initialValue is ignored if overwrite=TRUE
360     * @internal
361     */
362    private void fillBlock(int block, /*UChar32*/ int start, /*UChar32*/ int limit,
363              int value, int initialValue, boolean overwrite) {
364        int i;
365        int pLimit = block+limit;
366        if(overwrite) {
367            for (i=block+start; i<pLimit; i++) {
368                data[i] = value;
369            }
370        } else {
371            for (i=block+start; i<pLimit; i++) {
372                if(data[i]==initialValue) {
373                    data[i]=value;
374                }
375            }
376        }
377    }
378
379    /**
380     * Set a value in a range of code points [start..end].
381     * All code points c with start<=c<=end will get the value if
382     * overwrite is TRUE or if the old value is the initial value.
383     *
384     * @param start the first code point to get the value
385     * @param end the last code point to get the value (inclusive)
386     * @param value the value
387     * @param overwrite flag for whether old non-initial values are to be overwritten
388     */
389     public Trie2Writable setRange(int start, int end,
390                           int value, boolean overwrite) {
391         /*
392          * repeat value in [start..end]
393          * mark index values for repeat-data blocks by setting bit 31 of the index values
394          * fill around existing values if any, if(overwrite)
395          */
396         int block, rest, repeatBlock;
397         int /*UChar32*/ limit;
398
399         if(start>0x10ffff || start<0 || end>0x10ffff || end<0 || start>end) {
400             throw new IllegalArgumentException("Invalid code point range.");
401         }
402         if(!overwrite && value==initialValue) {
403             return this; /* nothing to do */
404         }
405         fHash = 0;
406         if(isCompacted) {
407             this.uncompact();
408         }
409
410         limit=end+1;
411         if((start&UTRIE2_DATA_MASK) != 0) {
412             int  /*UChar32*/ nextStart;
413
414             /* set partial block at [start..following block boundary[ */
415             block=getDataBlock(start, true);
416
417             nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
418             if(nextStart<=limit) {
419                 fillBlock(block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
420                           value, initialValue, overwrite);
421                 start=nextStart;
422             } else {
423                 fillBlock(block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
424                           value, initialValue, overwrite);
425                 return this;
426             }
427         }
428
429         /* number of positions in the last, partial block */
430         rest=limit&UTRIE2_DATA_MASK;
431
432         /* round down limit to a block boundary */
433         limit&=~UTRIE2_DATA_MASK;
434
435         /* iterate over all-value blocks */
436         if(value==initialValue) {
437             repeatBlock=dataNullOffset;
438         } else {
439             repeatBlock=-1;
440         }
441
442         while(start<limit) {
443             int i2;
444             boolean setRepeatBlock=false;
445
446             if(value==initialValue && isInNullBlock(start, true)) {
447                 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
448                 continue;
449             }
450
451             /* get index value */
452             i2=getIndex2Block(start, true);
453             i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
454             block=index2[i2];
455             if(isWritableBlock(block)) {
456                 /* already allocated */
457                 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
458                     /*
459                      * We overwrite all values, and it's not a
460                      * protected (ASCII-linear or 2-byte UTF-8) block:
461                      * replace with the repeatBlock.
462                      */
463                     setRepeatBlock=true;
464                 } else {
465                     /* !overwrite, or protected block: just write the values into this block */
466                     fillBlock(block,
467                               0, UTRIE2_DATA_BLOCK_LENGTH,
468                               value, initialValue, overwrite);
469                 }
470             } else if(data[block]!=value && (overwrite || block==dataNullOffset)) {
471                 /*
472                  * Set the repeatBlock instead of the null block or previous repeat block:
473                  *
474                  * If !isWritableBlock() then all entries in the block have the same value
475                  * because it's the null block or a range block (the repeatBlock from a previous
476                  * call to utrie2_setRange32()).
477                  * No other blocks are used multiple times before compacting.
478                  *
479                  * The null block is the only non-writable block with the initialValue because
480                  * of the repeatBlock initialization above. (If value==initialValue, then
481                  * the repeatBlock will be the null data block.)
482                  *
483                  * We set our repeatBlock if the desired value differs from the block's value,
484                  * and if we overwrite any data or if the data is all initial values
485                  * (which is the same as the block being the null block, see above).
486                  */
487                 setRepeatBlock=true;
488             }
489             if(setRepeatBlock) {
490                 if(repeatBlock>=0) {
491                     setIndex2Entry(i2, repeatBlock);
492                 } else {
493                     /* create and set and fill the repeatBlock */
494                     repeatBlock=getDataBlock(start, true);
495                     writeBlock(repeatBlock, value);
496                 }
497             }
498
499             start+=UTRIE2_DATA_BLOCK_LENGTH;
500         }
501
502         if(rest>0) {
503             /* set partial block at [last block boundary..limit[ */
504             block=getDataBlock(start, true);
505             fillBlock(block, 0, rest, value, initialValue, overwrite);
506         }
507
508         return this;
509     }
510
511     /**
512      * Set the values from a Trie2.Range.
513      *
514      * All code points within the range will get the value if
515      * overwrite is TRUE or if the old value is the initial value.
516      *
517      * Ranges with the lead surrogate flag set will set the alternate
518      * lead-surrogate values in the Trie, rather than the code point values.
519      *
520      * This function is intended to work with the ranges produced when iterating
521      * the contents of a source Trie.
522      *
523      * @param range contains the range of code points and the value to be set.
524      * @param overwrite flag for whether old non-initial values are to be overwritten
525      */
526      public Trie2Writable setRange(Trie2.Range range, boolean overwrite) {
527          fHash = 0;
528          if (range.leadSurrogate) {
529              for (int c=range.startCodePoint; c<=range.endCodePoint; c++) {
530                  if (overwrite || getFromU16SingleLead((char)c) == this.initialValue)  {
531                      setForLeadSurrogateCodeUnit((char)c, range.value);
532                  }
533              }
534          } else {
535              setRange(range.startCodePoint, range.endCodePoint, range.value, overwrite);
536          }
537          return this;
538      }
539
540     /**
541      * Set a value for a UTF-16 code unit.
542      * Note that a Trie2 stores separate values for
543      * supplementary code points in the lead surrogate range
544      * (accessed via the plain set() and get() interfaces)
545      * and for lead surrogate code units.
546      *
547      * The lead surrogate code unit values are set via this function and
548      * read by the function getFromU16SingleLead().
549      *
550      * For code units outside of the lead surrogate range, this function
551      * behaves identically to set().
552      *
553      * @param codeUnit A UTF-16 code unit.
554      * @param value the value to be stored in the Trie2.
555      */
556     public Trie2Writable setForLeadSurrogateCodeUnit(char codeUnit, int value) {
557         fHash = 0;
558         set(codeUnit, false, value);
559         return this;
560     }
561
562
563     /**
564      * Get the value for a code point as stored in the Trie2.
565      *
566      * @param codePoint the code point
567      * @return the value
568      */
569    @Override
570    public int get(int codePoint) {
571        if (codePoint<0 || codePoint>0x10ffff) {
572            return errorValue;
573        } else {
574            return get(codePoint, true);
575        }
576    }
577
578
579    private int get(int c, boolean fromLSCP) {
580        int i2, block;
581
582        if(c>=highStart && (!(c>=0xd800 && c<0xdc00) || fromLSCP)) {
583            return data[dataLength-UTRIE2_DATA_GRANULARITY];
584        }
585
586        if((c>=0xd800 && c<0xdc00) && fromLSCP) {
587            i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
588                (c>>UTRIE2_SHIFT_2);
589        } else {
590            i2=index1[c>>UTRIE2_SHIFT_1]+
591                ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
592        }
593        block=index2[i2];
594        return data[block+(c&UTRIE2_DATA_MASK)];
595    }
596
597    /**
598     * Get a trie value for a UTF-16 code unit.
599     *
600     * This function returns the same value as get() if the input
601     * character is outside of the lead surrogate range
602     *
603     * There are two values stored in a Trie for inputs in the lead
604     * surrogate range.  This function returns the alternate value,
605     * while Trie2.get() returns the main value.
606     *
607     * @param c the code point or lead surrogate value.
608     * @return the value
609     */
610    @Override
611    public int getFromU16SingleLead(char c) {
612        return get(c, false);
613    }
614
615    /* compaction --------------------------------------------------------------- */
616
617    private boolean equal_int(int[] a, int s, int t, int length) {
618        for (int i=0; i<length; i++) {
619            if (a[s+i] != a[t+i]) {
620                return false;
621            }
622        }
623        return true;
624    }
625
626
627    private int findSameIndex2Block(int index2Length, int otherBlock) {
628        int block;
629
630        /* ensure that we do not even partially get past index2Length */
631        index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
632
633        for(block=0; block<=index2Length; ++block) {
634            if(equal_int(index2, block, otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
635                return block;
636            }
637        }
638        return -1;
639    }
640
641
642    private int findSameDataBlock(int dataLength, int otherBlock, int blockLength) {
643        int block;
644
645        /* ensure that we do not even partially get past dataLength */
646        dataLength-=blockLength;
647
648        for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
649            if(equal_int(data, block, otherBlock, blockLength)) {
650                return block;
651            }
652        }
653        return -1;
654    }
655
656    /*
657     * Find the start of the last range in the trie by enumerating backward.
658     * Indexes for supplementary code points higher than this will be omitted.
659     */
660    private int findHighStart(int highValue) {
661
662        int value;
663        int c, prev;
664        int i1, i2, j, i2Block, prevI2Block, block, prevBlock;
665
666
667        /* set variables for previous range */
668        if(highValue==initialValue) {
669            prevI2Block=index2NullOffset;
670            prevBlock=dataNullOffset;
671        } else {
672            prevI2Block=-1;
673            prevBlock=-1;
674        }
675        prev=0x110000;
676
677        /* enumerate index-2 blocks */
678        i1=UNEWTRIE2_INDEX_1_LENGTH;
679        c=prev;
680        while(c>0) {
681            i2Block=index1[--i1];
682            if(i2Block==prevI2Block) {
683                /* the index-2 block is the same as the previous one, and filled with highValue */
684                c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
685                continue;
686            }
687            prevI2Block=i2Block;
688            if(i2Block==index2NullOffset) {
689                /* this is the null index-2 block */
690                if(highValue!=initialValue) {
691                    return c;
692                }
693                c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
694            } else {
695                /* enumerate data blocks for one index-2 block */
696                for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
697                    block=index2[i2Block+ --i2];
698                    if(block==prevBlock) {
699                        /* the block is the same as the previous one, and filled with highValue */
700                        c-=UTRIE2_DATA_BLOCK_LENGTH;
701                        continue;
702                    }
703                    prevBlock=block;
704                    if(block==dataNullOffset) {
705                        /* this is the null data block */
706                        if(highValue!=initialValue) {
707                            return c;
708                        }
709                        c-=UTRIE2_DATA_BLOCK_LENGTH;
710                    } else {
711                        for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
712                            value=data[block+ --j];
713                            if(value!=highValue) {
714                                return c;
715                            }
716                            --c;
717                        }
718                    }
719                }
720            }
721        }
722
723        /* deliver last range */
724        return 0;
725    }
726
727    /*
728     * Compact a build-time trie.
729     *
730     * The compaction
731     * - removes blocks that are identical with earlier ones
732     * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
733     * - moves blocks in steps of the data granularity
734     * - moves and overlaps blocks that overlap with multiple values in the overlap region
735     *
736     * It does not
737     * - try to move and overlap blocks that are not already adjacent
738     */
739    private void compactData() {
740        int start, newStart, movedStart;
741        int blockLength, overlap;
742        int i, mapIndex, blockCount;
743
744        /* do not compact linear-ASCII data */
745        newStart=UTRIE2_DATA_START_OFFSET;
746        for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
747            map[i]=start;
748        }
749
750        /*
751         * Start with a block length of 64 for 2-byte UTF-8,
752         * then switch to UTRIE2_DATA_BLOCK_LENGTH.
753         */
754        blockLength=64;
755        blockCount=blockLength>>UTRIE2_SHIFT_2;
756        for(start=newStart; start<dataLength;) {
757            /*
758             * start: index of first entry of current block
759             * newStart: index where the current block is to be moved
760             *           (right after current end of already-compacted data)
761             */
762            if(start==UNEWTRIE2_DATA_0800_OFFSET) {
763                blockLength=UTRIE2_DATA_BLOCK_LENGTH;
764                blockCount=1;
765            }
766
767            /* skip blocks that are not used */
768            if(map[start>>UTRIE2_SHIFT_2]<=0) {
769                /* advance start to the next block */
770                start+=blockLength;
771
772                /* leave newStart with the previous block! */
773                continue;
774            }
775
776            /* search for an identical block */
777            movedStart=findSameDataBlock(newStart, start, blockLength);
778            if(movedStart >= 0) {
779                /* found an identical block, set the other block's index value for the current block */
780                for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
781                    map[mapIndex++]=movedStart;
782                    movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
783                }
784
785                /* advance start to the next block */
786                start+=blockLength;
787
788                /* leave newStart with the previous block! */
789                continue;
790            }
791
792            /* see if the beginning of this block can be overlapped with the end of the previous block */
793            /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
794            for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
795                overlap>0 && !equal_int(data, (newStart-overlap), start, overlap);
796                overlap-=UTRIE2_DATA_GRANULARITY) {}
797
798            if(overlap>0 || newStart<start) {
799                /* some overlap, or just move the whole block */
800                movedStart=newStart-overlap;
801                for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
802                    map[mapIndex++]=movedStart;
803                    movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
804                }
805
806                /* move the non-overlapping indexes to their new positions */
807                start+=overlap;
808                for(i=blockLength-overlap; i>0; --i) {
809                    data[newStart++]=data[start++];
810                }
811            } else /* no overlap && newStart==start */ {
812                for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
813                    map[mapIndex++]=start;
814                    start+=UTRIE2_DATA_BLOCK_LENGTH;
815                }
816                newStart=start;
817            }
818        }
819
820        /* now adjust the index-2 table */
821        for(i=0; i<index2Length; ++i) {
822            if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
823                /* Gap indexes are invalid (-1). Skip over the gap. */
824                i+=UNEWTRIE2_INDEX_GAP_LENGTH;
825            }
826            index2[i]=map[index2[i]>>UTRIE2_SHIFT_2];
827        }
828        dataNullOffset=map[dataNullOffset>>UTRIE2_SHIFT_2];
829
830        /* ensure dataLength alignment */
831        while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
832            data[newStart++]=initialValue;
833        }
834
835        if  (UTRIE2_DEBUG) {
836            /* we saved some space */
837            System.out.printf("compacting UTrie2: count of 32-bit data words %d->%d%n",
838                dataLength, newStart);
839        }
840
841        dataLength=newStart;
842    }
843
844    private void compactIndex2() {
845        int i, start, newStart, movedStart, overlap;
846
847        /* do not compact linear-BMP index-2 blocks */
848        newStart=UTRIE2_INDEX_2_BMP_LENGTH;
849        for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
850            map[i]=start;
851        }
852
853        /* Reduce the index table gap to what will be needed at runtime. */
854        newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((highStart-0x10000)>>UTRIE2_SHIFT_1);
855
856        for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<index2Length;) {
857            /*
858             * start: index of first entry of current block
859             * newStart: index where the current block is to be moved
860             *           (right after current end of already-compacted data)
861             */
862
863            /* search for an identical block */
864            if( (movedStart=findSameIndex2Block(newStart, start))
865                 >=0
866            ) {
867                /* found an identical block, set the other block's index value for the current block */
868                map[start>>UTRIE2_SHIFT_1_2]=movedStart;
869
870                /* advance start to the next block */
871                start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
872
873                /* leave newStart with the previous block! */
874                continue;
875            }
876
877            /* see if the beginning of this block can be overlapped with the end of the previous block */
878            /* look for maximum overlap with the previous, adjacent block */
879            for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
880                overlap>0 && !equal_int(index2, newStart-overlap, start, overlap);
881                --overlap) {}
882
883            if(overlap>0 || newStart<start) {
884                /* some overlap, or just move the whole block */
885                map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
886
887                /* move the non-overlapping indexes to their new positions */
888                start+=overlap;
889                for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
890                    index2[newStart++]=index2[start++];
891                }
892            } else /* no overlap && newStart==start */ {
893                map[start>>UTRIE2_SHIFT_1_2]=start;
894                start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
895                newStart=start;
896            }
897        }
898
899        /* now adjust the index-1 table */
900        for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
901            index1[i]=map[index1[i]>>UTRIE2_SHIFT_1_2];
902        }
903        index2NullOffset=map[index2NullOffset>>UTRIE2_SHIFT_1_2];
904
905        /*
906         * Ensure data table alignment:
907         * Needs to be granularity-aligned for 16-bit trie
908         * (so that dataMove will be down-shiftable),
909         * and 2-aligned for uint32_t data.
910         */
911        while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
912            /* Arbitrary value: 0x3fffc not possible for real data. */
913            index2[newStart++]=0x0000ffff<<UTRIE2_INDEX_SHIFT;
914        }
915
916        if (UTRIE2_DEBUG) {
917            /* we saved some space */
918            System.out.printf("compacting UTrie2: count of 16-bit index-2 words %d->%d%n",
919                    index2Length, newStart);
920        }
921
922        index2Length=newStart;
923    }
924
925    private void compactTrie() {
926        int localHighStart;
927        int suppHighStart;
928        int highValue;
929
930        /* find highStart and round it up */
931        highValue=get(0x10ffff);
932        localHighStart=findHighStart(highValue);
933        localHighStart=(localHighStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
934        if(localHighStart==0x110000) {
935            highValue=errorValue;
936        }
937
938        /*
939         * Set trie->highStart only after utrie2_get32(trie, highStart).
940         * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
941         */
942        this.highStart=localHighStart;
943
944        if (UTRIE2_DEBUG) {
945            System.out.printf("UTrie2: highStart U+%04x  highValue 0x%x  initialValue 0x%x%n",
946                highStart, highValue, initialValue);
947        }
948
949        if(highStart<0x110000) {
950            /* Blank out [highStart..10ffff] to release associated data blocks. */
951            suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
952            setRange(suppHighStart, 0x10ffff, initialValue, true);
953        }
954
955        compactData();
956        if(highStart>0x10000) {
957            compactIndex2();
958        } else {
959            if (UTRIE2_DEBUG) {
960                 System.out.printf("UTrie2: highStart U+%04x  count of 16-bit index-2 words %d->%d%n",
961                         highStart, index2Length, UTRIE2_INDEX_1_OFFSET);
962            }
963        }
964
965        /*
966         * Store the highValue in the data array and round up the dataLength.
967         * Must be done after compactData() because that assumes that dataLength
968         * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
969         */
970        data[dataLength++]=highValue;
971        while((dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
972            data[dataLength++]=initialValue;
973        }
974
975        isCompacted=true;
976    }
977
978
979    /**
980     * Produce an optimized, read-only Trie2_16 from this writable Trie.
981     * The data values outside of the range that will fit in a 16 bit
982     * unsigned value will be truncated.
983     */
984    public Trie2_16 toTrie2_16() {
985        Trie2_16 frozenTrie = new Trie2_16();
986        freeze(frozenTrie, ValueWidth.BITS_16);
987        return frozenTrie;
988    }
989
990
991    /**
992     * Produce an optimized, read-only Trie2_32 from this writable Trie.
993     *
994     */
995    public Trie2_32 toTrie2_32() {
996        Trie2_32 frozenTrie = new Trie2_32();
997        freeze(frozenTrie, ValueWidth.BITS_32);
998        return frozenTrie;
999    }
1000
1001
1002    /**
1003     * Maximum length of the runtime index array.
1004     * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1005     * (The actual maximum length is lower,
1006     * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1007     */
1008    private static final int UTRIE2_MAX_INDEX_LENGTH = 0xffff;
1009
1010    /**
1011     * Maximum length of the runtime data array.
1012     * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1013     * and by uint16_t UTrie2Header.shiftedDataLength.
1014     */
1015    private static final int  UTRIE2_MAX_DATA_LENGTH = 0xffff<<UTRIE2_INDEX_SHIFT;
1016
1017    /* Compact the data and then populate an optimized read-only Trie.  */
1018    private void freeze(Trie2 dest, ValueWidth valueBits) {
1019        int  i;
1020        int  allIndexesLength;
1021        int  dataMove;  /* >0 if the data is moved to the end of the index array */
1022
1023
1024        /* compact if necessary */
1025        if(!isCompacted) {
1026            compactTrie();
1027        }
1028
1029        if(highStart<=0x10000) {
1030            allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1031        } else {
1032            allIndexesLength=index2Length;
1033        }
1034        if(valueBits==ValueWidth.BITS_16) {
1035            dataMove=allIndexesLength;
1036        } else {
1037            dataMove=0;
1038        }
1039
1040        /* are indexLength and dataLength within limits? */
1041        if( /* for unshifted indexLength */
1042            allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1043            /* for unshifted dataNullOffset */
1044            (dataMove+dataNullOffset)>0xffff ||
1045            /* for unshifted 2-byte UTF-8 index-2 values */
1046            (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1047            /* for shiftedDataLength */
1048            (dataMove+dataLength)>UTRIE2_MAX_DATA_LENGTH) {
1049                throw new UnsupportedOperationException("Trie2 data is too large.");
1050        }
1051
1052        /* calculate the sizes of, and allocate, the index and data arrays */
1053        int indexLength = allIndexesLength;
1054        if (valueBits==ValueWidth.BITS_16) {
1055            indexLength += dataLength;
1056        } else {
1057            dest.data32 = new int[dataLength];
1058        }
1059        dest.index = new char[indexLength];
1060
1061        dest.indexLength = allIndexesLength;
1062        dest.dataLength  = dataLength;
1063        if(highStart<=0x10000) {
1064            dest.index2NullOffset = 0xffff;
1065        } else {
1066            dest.index2NullOffset = UTRIE2_INDEX_2_OFFSET + index2NullOffset;
1067        }
1068        dest.initialValue   = initialValue;
1069        dest.errorValue     = errorValue;
1070        dest.highStart      = highStart;
1071        dest.highValueIndex = dataMove + dataLength - UTRIE2_DATA_GRANULARITY;
1072        dest.dataNullOffset = (dataMove+dataNullOffset);
1073
1074        // Create a header and set the its fields.
1075        //   (This is only used in the event that we serialize the Trie, but is
1076        //    convenient to do here.)
1077        dest.header = new Trie2.UTrie2Header();
1078        dest.header.signature         = 0x54726932; /* "Tri2" */
1079        dest.header.options           = valueBits==ValueWidth.BITS_16 ? 0 : 1;
1080        dest.header.indexLength       = dest.indexLength;
1081        dest.header.shiftedDataLength = dest.dataLength>>UTRIE2_INDEX_SHIFT;
1082        dest.header.index2NullOffset  = dest.index2NullOffset;
1083        dest.header.dataNullOffset    = dest.dataNullOffset;
1084        dest.header.shiftedHighStart  = dest.highStart>>UTRIE2_SHIFT_1;
1085
1086
1087
1088        /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1089        int destIdx = 0;
1090        for(i=0; i<UTRIE2_INDEX_2_BMP_LENGTH; i++) {
1091            dest.index[destIdx++] = (char)((index2[i]+dataMove) >> UTRIE2_INDEX_SHIFT);
1092        }
1093        if (UTRIE2_DEBUG) {
1094            System.out.println("\n\nIndex2 for BMP limit is " + Integer.toHexString(destIdx));
1095        }
1096
1097        /* write UTF-8 2-byte index-2 values, not right-shifted */
1098        for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
1099            dest.index[destIdx++] = (char)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1100        }
1101        for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
1102            dest.index[destIdx++]=(char)(dataMove+index2[i<<(6-UTRIE2_SHIFT_2)]);
1103        }
1104        if (UTRIE2_DEBUG) {
1105            System.out.println("Index2 for UTF-8 2byte values limit is " + Integer.toHexString(destIdx));
1106        }
1107
1108        if(highStart>0x10000) {
1109            int index1Length = (highStart-0x10000)>>UTRIE2_SHIFT_1;
1110            int index2Offset = UTRIE2_INDEX_2_BMP_LENGTH + UTRIE2_UTF8_2B_INDEX_2_LENGTH + index1Length;
1111
1112            /* write 16-bit index-1 values for supplementary code points */
1113            //p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1114            for(i=0; i<index1Length; i++) {
1115                //*dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1116                dest.index[destIdx++] = (char)(UTRIE2_INDEX_2_OFFSET + index1[i+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH]);
1117            }
1118            if (UTRIE2_DEBUG) {
1119                System.out.println("Index 1 for supplementals, limit is " + Integer.toHexString(destIdx));
1120            }
1121
1122            /*
1123             * write the index-2 array values for supplementary code points,
1124             * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1125             */
1126            for(i=0; i<index2Length-index2Offset; i++) {
1127                dest.index[destIdx++] = (char)((dataMove + index2[index2Offset+i])>>UTRIE2_INDEX_SHIFT);
1128            }
1129            if (UTRIE2_DEBUG) {
1130                System.out.println("Index 2 for supplementals, limit is " + Integer.toHexString(destIdx));
1131            }
1132        }
1133
1134        /* write the 16/32-bit data array */
1135        switch(valueBits) {
1136        case BITS_16:
1137            /* write 16-bit data values */
1138            assert(destIdx == dataMove);
1139            dest.data16 = destIdx;
1140            for(i=0; i<dataLength; i++) {
1141                dest.index[destIdx++] = (char)data[i];
1142            }
1143            break;
1144        case BITS_32:
1145            /* write 32-bit data values */
1146            for (i=0; i<dataLength; i++) {
1147                dest.data32[i] = this.data[i];
1148            }
1149            break;
1150        }
1151        // The writable, but compressed, Trie2 stays around unless the caller drops its references to it.
1152    }
1153
1154
1155    /* Start with allocation of 16k data entries. */
1156    private static final int UNEWTRIE2_INITIAL_DATA_LENGTH = 1<<14;
1157
1158    /* Grow about 8x each time. */
1159    private static final int UNEWTRIE2_MEDIUM_DATA_LENGTH = 1<<17;
1160
1161    /** The null index-2 block, following the gap in the index-2 table. */
1162    private static final int UNEWTRIE2_INDEX_2_NULL_OFFSET = UNEWTRIE2_INDEX_GAP_OFFSET + UNEWTRIE2_INDEX_GAP_LENGTH;
1163
1164    /** The start of allocated index-2 blocks. */
1165    private static final int UNEWTRIE2_INDEX_2_START_OFFSET = UNEWTRIE2_INDEX_2_NULL_OFFSET + UTRIE2_INDEX_2_BLOCK_LENGTH;
1166
1167    /**
1168     * The null data block.
1169     * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
1170     * to work with 6-bit trail bytes from 2-byte UTF-8.
1171     */
1172    private static final int UNEWTRIE2_DATA_NULL_OFFSET = UTRIE2_DATA_START_OFFSET;
1173
1174    /** The start of allocated data blocks. */
1175    private static final int UNEWTRIE2_DATA_START_OFFSET = UNEWTRIE2_DATA_NULL_OFFSET+0x40;
1176
1177    /**
1178     * The start of data blocks for U+0800 and above.
1179     * Below, compaction uses a block length of 64 for 2-byte UTF-8.
1180     * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
1181     * Data values for 0x780 code points beyond ASCII.
1182     */
1183    private static final int UNEWTRIE2_DATA_0800_OFFSET = UNEWTRIE2_DATA_START_OFFSET+0x780;
1184
1185    //
1186    // Private data members.  From struct UNewTrie2 in ICU4C
1187    //
1188    private  int[]   index1 = new int[UNEWTRIE2_INDEX_1_LENGTH];
1189    private  int[]   index2 = new int[UNEWTRIE2_MAX_INDEX_2_LENGTH];
1190    private  int[]   data;
1191
1192    private  int     index2Length;
1193    private  int     dataCapacity;
1194    private  int     firstFreeBlock;
1195    private  int     index2NullOffset;
1196    private  boolean isCompacted;
1197
1198
1199    /*
1200     * Multi-purpose per-data-block table.
1201     *
1202     * Before compacting:
1203     *
1204     * Per-data-block reference counters/free-block list.
1205     *  0: unused
1206     * >0: reference counter (number of index-2 entries pointing here)
1207     * <0: next free data block in free-block list
1208     *
1209     * While compacting:
1210     *
1211     * Map of adjusted indexes, used in compactData() and compactIndex2().
1212     * Maps from original indexes to new ones.
1213     */
1214     private  int[]   map = new int[UNEWTRIE2_MAX_DATA_LENGTH>>UTRIE2_SHIFT_2];
1215
1216
1217     private boolean UTRIE2_DEBUG = false;
1218
1219}
1220