CollationWeights.java revision bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7
1/*
2*******************************************************************************
3*
4*   Copyright (C) 1999-2015, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   CollationWeights.java, ported from collationweights.h/.cpp
9*
10*   C++ version created on: 2001mar08 as ucol_wgt.h
11*   created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import java.util.Arrays;
17
18/**
19 * Allocates n collation element weights between two exclusive limits.
20 * Used only internally by the collation tailoring builder.
21 */
22public final class CollationWeights {
23    public CollationWeights() {}
24
25    public void initForPrimary(boolean compressible) {
26        middleLength=1;
27        minBytes[1] = Collation.MERGE_SEPARATOR_BYTE + 1;
28        maxBytes[1] = Collation.TRAIL_WEIGHT_BYTE;
29        if(compressible) {
30            minBytes[2] = Collation.PRIMARY_COMPRESSION_LOW_BYTE + 1;
31            maxBytes[2] = Collation.PRIMARY_COMPRESSION_HIGH_BYTE - 1;
32        } else {
33            minBytes[2] = 2;
34            maxBytes[2] = 0xff;
35        }
36        minBytes[3] = 2;
37        maxBytes[3] = 0xff;
38        minBytes[4] = 2;
39        maxBytes[4] = 0xff;
40    }
41
42    public void initForSecondary() {
43        // We use only the lower 16 bits for secondary weights.
44        middleLength=3;
45        minBytes[1] = 0;
46        maxBytes[1] = 0;
47        minBytes[2] = 0;
48        maxBytes[2] = 0;
49        minBytes[3] = Collation.LEVEL_SEPARATOR_BYTE + 1;
50        maxBytes[3] = 0xff;
51        minBytes[4] = 2;
52        maxBytes[4] = 0xff;
53    }
54
55    public void initForTertiary() {
56        // We use only the lower 16 bits for tertiary weights.
57        middleLength=3;
58        minBytes[1] = 0;
59        maxBytes[1] = 0;
60        minBytes[2] = 0;
61        maxBytes[2] = 0;
62        // We use only 6 bits per byte.
63        // The other bits are used for case & quaternary weights.
64        minBytes[3] = Collation.LEVEL_SEPARATOR_BYTE + 1;
65        maxBytes[3] = 0x3f;
66        minBytes[4] = 2;
67        maxBytes[4] = 0x3f;
68    }
69
70    /**
71     * Determine heuristically
72     * what ranges to use for a given number of weights between (excluding)
73     * two limits.
74     *
75     * @param lowerLimit A collation element weight; the ranges will be filled to cover
76     *                   weights greater than this one.
77     * @param upperLimit A collation element weight; the ranges will be filled to cover
78     *                   weights less than this one.
79     * @param n          The number of collation element weights w necessary such that
80     *                   lowerLimit<w<upperLimit in lexical order.
81     * @return true if it is possible to fit n elements between the limits
82     */
83    public boolean allocWeights(long lowerLimit, long upperLimit, int n) {
84        // Call getWeightRanges() and then determine heuristically
85        // which ranges to use for a given number of weights between (excluding)
86        // two limits.
87        // puts("");
88
89        if(!getWeightRanges(lowerLimit, upperLimit)) {
90            // printf("error: unable to get Weight ranges\n");
91            return false;
92        }
93
94        /* try until we find suitably large ranges */
95        for(;;) {
96            /* get the smallest number of bytes in a range */
97            int minLength=ranges[0].length;
98
99            if(allocWeightsInShortRanges(n, minLength)) { break; }
100
101            if(minLength == 4) {
102                // printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
103                //       minLengthCount, n);
104                return false;
105            }
106
107            if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
108
109            /* no good match, lengthen all minLength ranges and iterate */
110            // printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
111            for(int i=0; ranges[i].length==minLength; ++i) {
112                lengthenRange(ranges[i]);
113            }
114        }
115
116        /* puts("final ranges:");
117        for(int i=0; i<rangeCount; ++i) {
118            printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
119                  i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
120        } */
121
122        rangeIndex = 0;
123        if(rangeCount < ranges.length) {
124            ranges[rangeCount] = null;  // force a crash when going out of bounds
125        }
126        return true;
127    }
128
129    /**
130     * Given a set of ranges calculated by allocWeights(),
131     * iterate through the weights.
132     * The ranges are modified to keep the current iteration state.
133     *
134     * @return The next weight in the ranges, or 0xffffffff if there is none left.
135     */
136    public long nextWeight() {
137        if(rangeIndex >= rangeCount) {
138            return 0xffffffffL;
139        } else {
140            /* get the next weight */
141            WeightRange range = ranges[rangeIndex];
142            long weight = range.start;
143            if(--range.count == 0) {
144                /* this range is finished */
145                ++rangeIndex;
146            } else {
147                /* increment the weight for the next value */
148                range.start = incWeight(weight, range.length);
149                assert(range.start <= range.end);
150            }
151
152            return weight;
153        }
154    }
155
156    /** @internal */
157    private static final class WeightRange implements Comparable<WeightRange> {
158        long start, end;
159        int length, count;
160
161        // Java 6: @Override
162        public int compareTo(WeightRange other) {
163            long l=start;
164            long r=other.start;
165            if(l<r) {
166                return -1;
167            } else if(l>r) {
168                return 1;
169            } else {
170                return 0;
171            }
172        }
173    }
174
175    /* helper functions for CE weights */
176
177    public static int lengthOfWeight(long weight) {
178        if((weight&0xffffff)==0) {
179            return 1;
180        } else if((weight&0xffff)==0) {
181            return 2;
182        } else if((weight&0xff)==0) {
183            return 3;
184        } else {
185            return 4;
186        }
187    }
188
189    private static int getWeightTrail(long weight, int length) {
190        return (int)(weight>>(8*(4-length)))&0xff;
191    }
192
193    private static long setWeightTrail(long weight, int length, int trail) {
194        length=8*(4-length);
195        return (weight&(0xffffff00L<<length))|((long)trail<<length);
196    }
197
198    private static int getWeightByte(long weight, int idx) {
199        return getWeightTrail(weight, idx); /* same calculation */
200    }
201
202    private static long setWeightByte(long weight, int idx, int b) {
203        long mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
204
205        idx*=8;
206        if(idx<32) {
207            mask=0xffffffffL>>idx;
208        } else {
209            // Do not use int>>32 because on some platforms that does not shift at all
210            // while we need it to become 0.
211            // PowerPC: 0xffffffff>>32 = 0           (wanted)
212            // x86:     0xffffffff>>32 = 0xffffffff  (not wanted)
213            //
214            // ANSI C99 6.5.7 Bitwise shift operators:
215            // "If the value of the right operand is negative
216            // or is greater than or equal to the width of the promoted left operand,
217            // the behavior is undefined."
218            mask=0;
219        }
220        idx=32-idx;
221        mask|=0xffffff00L<<idx;
222        return (weight&mask)|((long)b<<idx);
223    }
224
225    private static long truncateWeight(long weight, int length) {
226        return weight&(0xffffffffL<<(8*(4-length)));
227    }
228
229    private static long incWeightTrail(long weight, int length) {
230        return weight+(1L<<(8*(4-length)));
231    }
232
233    private static long decWeightTrail(long weight, int length) {
234        return weight-(1L<<(8*(4-length)));
235    }
236
237    /** @return number of usable byte values for byte idx */
238    private int countBytes(int idx) {
239        return maxBytes[idx] - minBytes[idx] + 1;
240    }
241
242    private long incWeight(long weight, int length) {
243        for(;;) {
244            int b=getWeightByte(weight, length);
245            if(b<maxBytes[length]) {
246                return setWeightByte(weight, length, b+1);
247            } else {
248                // Roll over, set this byte to the minimum and increment the previous one.
249                weight=setWeightByte(weight, length, minBytes[length]);
250                --length;
251                assert(length > 0);
252            }
253        }
254    }
255
256    private long incWeightByOffset(long weight, int length, int offset) {
257        for(;;) {
258            offset += getWeightByte(weight, length);
259            if(offset <= maxBytes[length]) {
260                return setWeightByte(weight, length, offset);
261            } else {
262                // Split the offset between this byte and the previous one.
263                offset -= minBytes[length];
264                weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
265                offset /= countBytes(length);
266                --length;
267                assert(length > 0);
268            }
269        }
270    }
271
272    private void lengthenRange(WeightRange range) {
273        int length=range.length+1;
274        range.start=setWeightTrail(range.start, length, minBytes[length]);
275        range.end=setWeightTrail(range.end, length, maxBytes[length]);
276        range.count*=countBytes(length);
277        range.length=length;
278    }
279
280    /**
281     * Takes two CE weights and calculates the
282     * possible ranges of weights between the two limits, excluding them.
283     * For weights with up to 4 bytes there are up to 2*4-1=7 ranges.
284     */
285    private boolean getWeightRanges(long lowerLimit, long upperLimit) {
286        assert(lowerLimit != 0);
287        assert(upperLimit != 0);
288
289        /* get the lengths of the limits */
290        int lowerLength=lengthOfWeight(lowerLimit);
291        int upperLength=lengthOfWeight(upperLimit);
292
293        // printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
294        // printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
295        assert(lowerLength>=middleLength);
296        // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
297
298        if(lowerLimit>=upperLimit) {
299            // printf("error: no space between lower & upper limits\n");
300            return false;
301        }
302
303        /* check that neither is a prefix of the other */
304        if(lowerLength<upperLength) {
305            if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
306                // printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
307                return false;
308            }
309        }
310        /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
311
312        WeightRange[] lower = new WeightRange[5]; /* [0] and [1] are not used - this simplifies indexing */
313        WeightRange middle = new WeightRange();
314        WeightRange[] upper = new WeightRange[5];
315
316        /*
317         * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
318         * range     minimum length
319         * lower[4]  4
320         * lower[3]  3
321         * lower[2]  2
322         * middle    1
323         * upper[2]  2
324         * upper[3]  3
325         * upper[4]  4
326         *
327         * We are now going to calculate up to 7 ranges.
328         * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
329         */
330        long weight=lowerLimit;
331        for(int length=lowerLength; length>middleLength; --length) {
332            int trail=getWeightTrail(weight, length);
333            if(trail<maxBytes[length]) {
334                lower[length] = new WeightRange();
335                lower[length].start=incWeightTrail(weight, length);
336                lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
337                lower[length].length=length;
338                lower[length].count=maxBytes[length]-trail;
339            }
340            weight=truncateWeight(weight, length-1);
341        }
342        if(weight<0xff000000L) {
343            middle.start=incWeightTrail(weight, middleLength);
344        } else {
345            // Prevent overflow for primary lead byte FF
346            // which would yield a middle range starting at 0.
347            middle.start=0xffffffffL;  // no middle range
348        }
349
350        weight=upperLimit;
351        for(int length=upperLength; length>middleLength; --length) {
352            int trail=getWeightTrail(weight, length);
353            if(trail>minBytes[length]) {
354                upper[length] = new WeightRange();
355                upper[length].start=setWeightTrail(weight, length, minBytes[length]);
356                upper[length].end=decWeightTrail(weight, length);
357                upper[length].length=length;
358                upper[length].count=trail-minBytes[length];
359            }
360            weight=truncateWeight(weight, length-1);
361        }
362        middle.end=decWeightTrail(weight, middleLength);
363
364        /* set the middle range */
365        middle.length=middleLength;
366        if(middle.end>=middle.start) {
367            middle.count=(int)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
368        } else {
369            /* no middle range, eliminate overlaps */
370            for(int length=4; length>middleLength; --length) {
371                if(lower[length] != null && upper[length] != null &&
372                        lower[length].count>0 && upper[length].count>0) {
373                    // Note: The lowerEnd and upperStart weights are versions of
374                    // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
375                    // truncated (still less-or-equal)
376                    // and then with their last bytes changed to the
377                    // maxByte (for lowerEnd) or minByte (for upperStart).
378                    final long lowerEnd=lower[length].end;
379                    final long upperStart=upper[length].start;
380                    boolean merged=false;
381
382                    if(lowerEnd>upperStart) {
383                        // These two lower and upper ranges collide.
384                        // Since lowerLimit<upperLimit and lowerEnd and upperStart
385                        // are versions with only their last bytes modified
386                        // (and following ones removed/reset to 0),
387                        // lowerEnd>upperStart is only possible
388                        // if the leading bytes are equal
389                        // and lastByte(lowerEnd)>lastByte(upperStart).
390                        assert(truncateWeight(lowerEnd, length-1)==
391                                truncateWeight(upperStart, length-1));
392                        // Intersect these two ranges.
393                        lower[length].end=upper[length].end;
394                        lower[length].count=
395                                getWeightTrail(lower[length].end, length)-
396                                getWeightTrail(lower[length].start, length)+1;
397                        // count might be <=0 in which case there is no room,
398                        // and the range-collecting code below will ignore this range.
399                        merged=true;
400                    } else if(lowerEnd==upperStart) {
401                        // Not possible, unless minByte==maxByte which is not allowed.
402                        assert(minBytes[length]<maxBytes[length]);
403                    } else /* lowerEnd<upperStart */ {
404                        if(incWeight(lowerEnd, length)==upperStart) {
405                            // Merge adjacent ranges.
406                            lower[length].end=upper[length].end;
407                            lower[length].count+=upper[length].count;  // might be >countBytes
408                            merged=true;
409                        }
410                    }
411                    if(merged) {
412                        // Remove all shorter ranges.
413                        // There was no room available for them between the ranges we just merged.
414                        upper[length].count=0;
415                        while(--length>middleLength) {
416                            lower[length]=upper[length]=null;
417                        }
418                        break;
419                    }
420                }
421            }
422        }
423
424        /* print ranges
425        for(int length=4; length>=2; --length) {
426            if(lower[length].count>0) {
427                printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
428            }
429        }
430        if(middle.count>0) {
431            printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
432        }
433        for(int length=2; length<=4; ++length) {
434            if(upper[length].count>0) {
435                printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
436            }
437        } */
438
439        /* copy the ranges, shortest first, into the result array */
440        rangeCount=0;
441        if(middle.count>0) {
442            ranges[0] = middle;
443            rangeCount=1;
444        }
445        for(int length=middleLength+1; length<=4; ++length) {
446            /* copy upper first so that later the middle range is more likely the first one to use */
447            if(upper[length] != null && upper[length].count>0) {
448                ranges[rangeCount++]=upper[length];
449            }
450            if(lower[length] != null && lower[length].count>0) {
451                ranges[rangeCount++]=lower[length];
452            }
453        }
454        return rangeCount>0;
455    }
456
457    private boolean allocWeightsInShortRanges(int n, int minLength) {
458        // See if the first few minLength and minLength+1 ranges have enough weights.
459        for(int i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
460            if(n <= ranges[i].count) {
461                // Use the first few minLength and minLength+1 ranges.
462                if(ranges[i].length > minLength) {
463                    // Reduce the number of weights from the last minLength+1 range
464                    // which might sort before some minLength ranges,
465                    // so that we use all weights in the minLength ranges.
466                    ranges[i].count = n;
467                }
468                rangeCount = i + 1;
469                // printf("take first %ld ranges\n", rangeCount);
470
471                if(rangeCount>1) {
472                    /* sort the ranges by weight values */
473                    Arrays.sort(ranges, 0, rangeCount);
474                }
475                return true;
476            }
477            n -= ranges[i].count;  // still >0
478        }
479        return false;
480    }
481
482    private boolean allocWeightsInMinLengthRanges(int n, int minLength) {
483        // See if the minLength ranges have enough weights
484        // when we split one and lengthen the following ones.
485        int count = 0;
486        int minLengthRangeCount;
487        for(minLengthRangeCount = 0;
488                minLengthRangeCount < rangeCount &&
489                    ranges[minLengthRangeCount].length == minLength;
490                ++minLengthRangeCount) {
491            count += ranges[minLengthRangeCount].count;
492        }
493
494        int nextCountBytes = countBytes(minLength + 1);
495        if(n > count * nextCountBytes) { return false; }
496
497        // Use the minLength ranges. Merge them, and then split again as necessary.
498        long start = ranges[0].start;
499        long end = ranges[0].end;
500        for(int i = 1; i < minLengthRangeCount; ++i) {
501            if(ranges[i].start < start) { start = ranges[i].start; }
502            if(ranges[i].end > end) { end = ranges[i].end; }
503        }
504
505        // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
506        // Goal:
507        //   count1 + count2 * nextCountBytes = n
508        //   count1 + count2 = count
509        // These turn into
510        //   (count - count2) + count2 * nextCountBytes = n
511        // and then into the following count1 & count2 computations.
512        int count2 = (n - count) / (nextCountBytes - 1);  // number of weights to be lengthened
513        int count1 = count - count2;  // number of minLength weights
514        if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
515            // round up
516            ++count2;
517            --count1;
518            assert((count1 + count2 * nextCountBytes) >= n);
519        }
520
521        ranges[0].start = start;
522
523        if(count1 == 0) {
524            // Make one long range.
525            ranges[0].end = end;
526            ranges[0].count = count;
527            lengthenRange(ranges[0]);
528            rangeCount = 1;
529        } else {
530            // Split the range, lengthen the second part.
531            // printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
532            //       splitRange, rangeCount, count1, count2);
533
534            // Next start = start + count1. First end = 1 before that.
535            ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
536            ranges[0].count = count1;
537
538            if(ranges[1] == null) {
539                ranges[1] = new WeightRange();
540            }
541            ranges[1].start = incWeight(ranges[0].end, minLength);
542            ranges[1].end = end;
543            ranges[1].length = minLength;  // +1 when lengthened
544            ranges[1].count = count2;  // *countBytes when lengthened
545            lengthenRange(ranges[1]);
546            rangeCount = 2;
547        }
548        return true;
549    }
550
551    private int middleLength;
552    private int[] minBytes = new int[5];  // for byte 1, 2, 3, 4
553    private int[] maxBytes = new int[5];
554    private WeightRange[] ranges = new WeightRange[7];
555    private int rangeIndex;
556    private int rangeCount;
557}
558