1/*
2*******************************************************************************
3*
4*   Copyright (C) 1999-2014, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  collationweights.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2001mar08 as ucol_wgt.cpp
14*   created by: Markus W. Scherer
15*
16*   This file contains code for allocating n collation element weights
17*   between two exclusive limits.
18*   It is used only internally by the collation tailoring builder.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "cmemory.h"
26#include "collation.h"
27#include "collationweights.h"
28#include "uarrsort.h"
29#include "uassert.h"
30
31#ifdef UCOL_DEBUG
32#   include <stdio.h>
33#endif
34
35U_NAMESPACE_BEGIN
36
37/* collation element weight allocation -------------------------------------- */
38
39/* helper functions for CE weights */
40
41static inline uint32_t
42getWeightTrail(uint32_t weight, int32_t length) {
43    return (uint32_t)(weight>>(8*(4-length)))&0xff;
44}
45
46static inline uint32_t
47setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
48    length=8*(4-length);
49    return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
50}
51
52static inline uint32_t
53getWeightByte(uint32_t weight, int32_t idx) {
54    return getWeightTrail(weight, idx); /* same calculation */
55}
56
57static inline uint32_t
58setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
59    uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
60
61    idx*=8;
62    if(idx<32) {
63        mask=((uint32_t)0xffffffff)>>idx;
64    } else {
65        // Do not use uint32_t>>32 because on some platforms that does not shift at all
66        // while we need it to become 0.
67        // PowerPC: 0xffffffff>>32 = 0           (wanted)
68        // x86:     0xffffffff>>32 = 0xffffffff  (not wanted)
69        //
70        // ANSI C99 6.5.7 Bitwise shift operators:
71        // "If the value of the right operand is negative
72        // or is greater than or equal to the width of the promoted left operand,
73        // the behavior is undefined."
74        mask=0;
75    }
76    idx=32-idx;
77    mask|=0xffffff00<<idx;
78    return (uint32_t)((weight&mask)|(byte<<idx));
79}
80
81static inline uint32_t
82truncateWeight(uint32_t weight, int32_t length) {
83    return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
84}
85
86static inline uint32_t
87incWeightTrail(uint32_t weight, int32_t length) {
88    return (uint32_t)(weight+(1UL<<(8*(4-length))));
89}
90
91static inline uint32_t
92decWeightTrail(uint32_t weight, int32_t length) {
93    return (uint32_t)(weight-(1UL<<(8*(4-length))));
94}
95
96CollationWeights::CollationWeights()
97        : middleLength(0), rangeIndex(0), rangeCount(0) {
98    for(int32_t i = 0; i < 5; ++i) {
99        minBytes[i] = maxBytes[i] = 0;
100    }
101}
102
103void
104CollationWeights::initForPrimary(UBool compressible) {
105    middleLength=1;
106    minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
107    maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
108    if(compressible) {
109        minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
110        maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
111    } else {
112        minBytes[2] = 2;
113        maxBytes[2] = 0xff;
114    }
115    minBytes[3] = 2;
116    maxBytes[3] = 0xff;
117    minBytes[4] = 2;
118    maxBytes[4] = 0xff;
119}
120
121void
122CollationWeights::initForSecondary() {
123    // We use only the lower 16 bits for secondary weights.
124    middleLength=3;
125    minBytes[1] = 0;
126    maxBytes[1] = 0;
127    minBytes[2] = 0;
128    maxBytes[2] = 0;
129    minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
130    maxBytes[3] = 0xff;
131    minBytes[4] = 2;
132    maxBytes[4] = 0xff;
133}
134
135void
136CollationWeights::initForTertiary() {
137    // We use only the lower 16 bits for tertiary weights.
138    middleLength=3;
139    minBytes[1] = 0;
140    maxBytes[1] = 0;
141    minBytes[2] = 0;
142    maxBytes[2] = 0;
143    // We use only 6 bits per byte.
144    // The other bits are used for case & quaternary weights.
145    minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;
146    maxBytes[3] = 0x3f;
147    minBytes[4] = 2;
148    maxBytes[4] = 0x3f;
149}
150
151uint32_t
152CollationWeights::incWeight(uint32_t weight, int32_t length) const {
153    for(;;) {
154        uint32_t byte=getWeightByte(weight, length);
155        if(byte<maxBytes[length]) {
156            return setWeightByte(weight, length, byte+1);
157        } else {
158            // Roll over, set this byte to the minimum and increment the previous one.
159            weight=setWeightByte(weight, length, minBytes[length]);
160            --length;
161            U_ASSERT(length > 0);
162        }
163    }
164}
165
166uint32_t
167CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
168    for(;;) {
169        offset += getWeightByte(weight, length);
170        if((uint32_t)offset <= maxBytes[length]) {
171            return setWeightByte(weight, length, offset);
172        } else {
173            // Split the offset between this byte and the previous one.
174            offset -= minBytes[length];
175            weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
176            offset /= countBytes(length);
177            --length;
178            U_ASSERT(length > 0);
179        }
180    }
181}
182
183void
184CollationWeights::lengthenRange(WeightRange &range) const {
185    int32_t length=range.length+1;
186    range.start=setWeightTrail(range.start, length, minBytes[length]);
187    range.end=setWeightTrail(range.end, length, maxBytes[length]);
188    range.count*=countBytes(length);
189    range.length=length;
190}
191
192/* for uprv_sortArray: sort ranges in weight order */
193static int32_t U_CALLCONV
194compareRanges(const void * /*context*/, const void *left, const void *right) {
195    uint32_t l, r;
196
197    l=((const CollationWeights::WeightRange *)left)->start;
198    r=((const CollationWeights::WeightRange *)right)->start;
199    if(l<r) {
200        return -1;
201    } else if(l>r) {
202        return 1;
203    } else {
204        return 0;
205    }
206}
207
208UBool
209CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
210    U_ASSERT(lowerLimit != 0);
211    U_ASSERT(upperLimit != 0);
212
213    /* get the lengths of the limits */
214    int32_t lowerLength=lengthOfWeight(lowerLimit);
215    int32_t upperLength=lengthOfWeight(upperLimit);
216
217#ifdef UCOL_DEBUG
218    printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
219    printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
220#endif
221    U_ASSERT(lowerLength>=middleLength);
222    // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
223
224    if(lowerLimit>=upperLimit) {
225#ifdef UCOL_DEBUG
226        printf("error: no space between lower & upper limits\n");
227#endif
228        return FALSE;
229    }
230
231    /* check that neither is a prefix of the other */
232    if(lowerLength<upperLength) {
233        if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
234#ifdef UCOL_DEBUG
235            printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
236#endif
237            return FALSE;
238        }
239    }
240    /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
241
242    WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
243    uprv_memset(lower, 0, sizeof(lower));
244    uprv_memset(&middle, 0, sizeof(middle));
245    uprv_memset(upper, 0, sizeof(upper));
246
247    /*
248     * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
249     * range     minimum length
250     * lower[4]  4
251     * lower[3]  3
252     * lower[2]  2
253     * middle    1
254     * upper[2]  2
255     * upper[3]  3
256     * upper[4]  4
257     *
258     * We are now going to calculate up to 7 ranges.
259     * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
260     */
261    uint32_t weight=lowerLimit;
262    for(int32_t length=lowerLength; length>middleLength; --length) {
263        uint32_t trail=getWeightTrail(weight, length);
264        if(trail<maxBytes[length]) {
265            lower[length].start=incWeightTrail(weight, length);
266            lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
267            lower[length].length=length;
268            lower[length].count=maxBytes[length]-trail;
269        }
270        weight=truncateWeight(weight, length-1);
271    }
272    if(weight<0xff000000) {
273        middle.start=incWeightTrail(weight, middleLength);
274    } else {
275        // Prevent overflow for primary lead byte FF
276        // which would yield a middle range starting at 0.
277        middle.start=0xffffffff;  // no middle range
278    }
279
280    weight=upperLimit;
281    for(int32_t length=upperLength; length>middleLength; --length) {
282        uint32_t trail=getWeightTrail(weight, length);
283        if(trail>minBytes[length]) {
284            upper[length].start=setWeightTrail(weight, length, minBytes[length]);
285            upper[length].end=decWeightTrail(weight, length);
286            upper[length].length=length;
287            upper[length].count=trail-minBytes[length];
288        }
289        weight=truncateWeight(weight, length-1);
290    }
291    middle.end=decWeightTrail(weight, middleLength);
292
293    /* set the middle range */
294    middle.length=middleLength;
295    if(middle.end>=middle.start) {
296        middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
297    } else {
298        /* no middle range, eliminate overlaps */
299
300        /* reduce or remove the lower ranges that go beyond upperLimit */
301        for(int32_t length=4; length>middleLength; --length) {
302            if(lower[length].count>0 && upper[length].count>0) {
303                uint32_t start=upper[length].start;
304                uint32_t end=lower[length].end;
305
306                if(end>=start || incWeight(end, length)==start) {
307                    /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
308                    start=lower[length].start;
309                    end=lower[length].end=upper[length].end;
310                    /*
311                     * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
312                     * it may result in a range with count>countBytes
313                     */
314                    lower[length].count=
315                        (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
316                                  countBytes(length)*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
317                    upper[length].count=0;
318                    while(--length>middleLength) {
319                        lower[length].count=upper[length].count=0;
320                    }
321                    break;
322                }
323            }
324        }
325    }
326
327#ifdef UCOL_DEBUG
328    /* print ranges */
329    for(int32_t length=4; length>=2; --length) {
330        if(lower[length].count>0) {
331            printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
332        }
333    }
334    if(middle.count>0) {
335        printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
336    }
337    for(int32_t length=2; length<=4; ++length) {
338        if(upper[length].count>0) {
339            printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
340        }
341    }
342#endif
343
344    /* copy the ranges, shortest first, into the result array */
345    rangeCount=0;
346    if(middle.count>0) {
347        uprv_memcpy(ranges, &middle, sizeof(WeightRange));
348        rangeCount=1;
349    }
350    for(int32_t length=middleLength+1; length<=4; ++length) {
351        /* copy upper first so that later the middle range is more likely the first one to use */
352        if(upper[length].count>0) {
353            uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
354            ++rangeCount;
355        }
356        if(lower[length].count>0) {
357            uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
358            ++rangeCount;
359        }
360    }
361    return rangeCount>0;
362}
363
364UBool
365CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
366    // See if the first few minLength and minLength+1 ranges have enough weights.
367    for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
368        if(n <= ranges[i].count) {
369            // Use the first few minLength and minLength+1 ranges.
370            if(ranges[i].length > minLength) {
371                // Reduce the number of weights from the last minLength+1 range
372                // which might sort before some minLength ranges,
373                // so that we use all weights in the minLength ranges.
374                ranges[i].count = n;
375            }
376            rangeCount = i + 1;
377#ifdef UCOL_DEBUG
378            printf("take first %ld ranges\n", rangeCount);
379#endif
380
381            if(rangeCount>1) {
382                /* sort the ranges by weight values */
383                UErrorCode errorCode=U_ZERO_ERROR;
384                uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
385                               compareRanges, NULL, FALSE, &errorCode);
386                /* ignore error code: we know that the internal sort function will not fail here */
387            }
388            return TRUE;
389        }
390        n -= ranges[i].count;  // still >0
391    }
392    return FALSE;
393}
394
395UBool
396CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
397    // See if the minLength ranges have enough weights
398    // when we split one and lengthen the following ones.
399    int32_t count = 0;
400    int32_t minLengthRangeCount;
401    for(minLengthRangeCount = 0;
402            minLengthRangeCount < rangeCount &&
403                ranges[minLengthRangeCount].length == minLength;
404            ++minLengthRangeCount) {
405        count += ranges[minLengthRangeCount].count;
406    }
407
408    int32_t nextCountBytes = countBytes(minLength + 1);
409    if(n > count * nextCountBytes) { return FALSE; }
410
411    // Use the minLength ranges. Merge them, and then split again as necessary.
412    uint32_t start = ranges[0].start;
413    uint32_t end = ranges[0].end;
414    for(int32_t i = 1; i < minLengthRangeCount; ++i) {
415        if(ranges[i].start < start) { start = ranges[i].start; }
416        if(ranges[i].end > end) { end = ranges[i].end; }
417    }
418
419    // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
420    // Goal:
421    //   count1 + count2 * nextCountBytes = n
422    //   count1 + count2 = count
423    // These turn into
424    //   (count - count2) + count2 * nextCountBytes = n
425    // and then into the following count1 & count2 computations.
426    int32_t count2 = (n - count) / (nextCountBytes - 1);  // number of weights to be lengthened
427    int32_t count1 = count - count2;  // number of minLength weights
428    if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
429        // round up
430        ++count2;
431        --count1;
432        U_ASSERT((count1 + count2 * nextCountBytes) >= n);
433    }
434
435    ranges[0].start = start;
436
437    if(count1 == 0) {
438        // Make one long range.
439        ranges[0].end = end;
440        ranges[0].count = count;
441        lengthenRange(ranges[0]);
442        rangeCount = 1;
443    } else {
444        // Split the range, lengthen the second part.
445#ifdef UCOL_DEBUG
446        printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
447               splitRange, rangeCount, count1, count2);
448#endif
449
450        // Next start = start + count1. First end = 1 before that.
451        ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
452        ranges[0].count = count1;
453
454        ranges[1].start = incWeight(ranges[0].end, minLength);
455        ranges[1].end = end;
456        ranges[1].length = minLength;  // +1 when lengthened
457        ranges[1].count = count2;  // *countBytes when lengthened
458        lengthenRange(ranges[1]);
459        rangeCount = 2;
460    }
461    return TRUE;
462}
463
464/*
465 * call getWeightRanges and then determine heuristically
466 * which ranges to use for a given number of weights between (excluding)
467 * two limits
468 */
469UBool
470CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
471#ifdef UCOL_DEBUG
472    puts("");
473#endif
474
475    if(!getWeightRanges(lowerLimit, upperLimit)) {
476#ifdef UCOL_DEBUG
477        printf("error: unable to get Weight ranges\n");
478#endif
479        return FALSE;
480    }
481
482    /* try until we find suitably large ranges */
483    for(;;) {
484        /* get the smallest number of bytes in a range */
485        int32_t minLength=ranges[0].length;
486
487        if(allocWeightsInShortRanges(n, minLength)) { break; }
488
489        if(minLength == 4) {
490#ifdef UCOL_DEBUG
491            printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
492                   minLengthCount, n);
493#endif
494            return FALSE;
495        }
496
497        if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
498
499        /* no good match, lengthen all minLength ranges and iterate */
500#ifdef UCOL_DEBUG
501        printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
502#endif
503        for(int32_t i=0; ranges[i].length==minLength; ++i) {
504            lengthenRange(ranges[i]);
505        }
506    }
507
508#ifdef UCOL_DEBUG
509    puts("final ranges:");
510    for(int32_t i=0; i<rangeCount; ++i) {
511        printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
512               i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
513    }
514#endif
515
516    rangeIndex = 0;
517    return TRUE;
518}
519
520uint32_t
521CollationWeights::nextWeight() {
522    if(rangeIndex >= rangeCount) {
523        return 0xffffffff;
524    } else {
525        /* get the next weight */
526        WeightRange &range = ranges[rangeIndex];
527        uint32_t weight = range.start;
528        if(--range.count == 0) {
529            /* this range is finished */
530            ++rangeIndex;
531        } else {
532            /* increment the weight for the next value */
533            range.start = incWeight(weight, range.length);
534            U_ASSERT(range.start <= range.end);
535        }
536
537        return weight;
538    }
539}
540
541U_NAMESPACE_END
542
543#endif /* #if !UCONFIG_NO_COLLATION */
544