1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/**
4*******************************************************************************
5* Copyright (C) 1996-2014, International Business Machines Corporation and
6* others. All Rights Reserved.
7*******************************************************************************
8*/
9package com.ibm.icu.impl.coll;
10
11import com.ibm.icu.util.ByteArrayWrapper;
12
13/**
14 * <p>Binary Ordered Compression Scheme for Unicode</p>
15 *
16 * <p>Users are strongly encouraged to read the ICU paper on
17 * <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html">
18 * BOCU</a> before attempting to use this class.</p>
19 *
20 * <p>BOCU is used to compress unicode text into a stream of unsigned
21 * bytes.  For many kinds of text the compression compares favorably
22 * to UTF-8, and for some kinds of text (such as CJK) it does better.
23 * The resulting bytes will compare in the same order as the original
24 * code points.  The byte stream does not contain the values 0, 1, or
25 * 2.</p>
26 *
27 * <p>One example of a use of BOCU is in
28 * com.ibm.icu.text.Collator#getCollationKey(String) for a RuleBasedCollator object with
29 * collation strength IDENTICAL. The result CollationKey will consist of the
30 * collation order of the source string followed by the BOCU result of the
31 * source string.
32 * </p>
33 *
34 * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for
35 * random access.</p>
36 *
37 * <p>Method: Slope Detection<br> Remember the previous code point
38 * (initial 0).  For each code point in the string, encode the
39 * difference with the previous one.  Similar to a UTF, the length of
40 * the byte sequence is encoded in the lead bytes.  Unlike a UTF, the
41 * trail byte values may overlap with lead/single byte values.  The
42 * signedness of the difference must be encoded as the most
43 * significant part.</p>
44 *
45 * <p>We encode differences with few bytes if their absolute values
46 * are small.  For correct ordering, we must treat the entire value
47 * range -10ffff..+10ffff in ascending order, which forbids encoding
48 * the sign and the absolute value separately. Instead, we split the
49 * lead byte range in the middle and encode non-negative values going
50 * up and negative values going down.</p>
51 *
52 * <p>For very small absolute values, the difference is added to a
53 * middle byte value for single-byte encoded differences.  For
54 * somewhat larger absolute values, the difference is divided by the
55 * number of byte values available, the modulo is used for one trail
56 * byte, and the remainder is added to a lead byte avoiding the
57 * single-byte range.  For large absolute values, the difference is
58 * similarly encoded in three bytes. (Syn Wee, I need examples
59 * here.)</p>
60 *
61 * <p>BOCU does not use byte values 0, 1, or 2, but uses all other
62 * byte values for lead and single bytes, so that the middle range of
63 * single bytes is as large as possible.</p>
64 *
65 * <p>Note that the lead byte ranges overlap some, but that the
66 * sequences as a whole are well ordered. I.e., even if the lead byte
67 * is the same for sequences of different lengths, the trail bytes
68 * establish correct order.  It would be possible to encode slightly
69 * larger ranges for each length (>1) by subtracting the lower bound
70 * of the range. However, that would also slow down the calculation.
71 * (Syn Wee, need an example).</p>
72 *
73 * <p>For the actual string encoding, an optimization moves the
74 * previous code point value to the middle of its Unicode script block
75 * to minimize the differences in same-script text runs.  (Syn Wee,
76 * need an example.)</p>
77 *
78 * @author Syn Wee Quek
79 * @since release 2.2, May 3rd 2002
80 */
81public class BOCSU
82{
83    // public methods -------------------------------------------------------
84
85    /**
86     * Encode the code points of a string as
87     * a sequence of byte-encoded differences (slope detection),
88     * preserving lexical order.
89     *
90     * <p>Optimize the difference-taking for runs of Unicode text within
91     * small scripts:
92     *
93     * <p>Most small scripts are allocated within aligned 128-blocks of Unicode
94     * code points. Lexical order is preserved if "prev" is always moved
95     * into the middle of such a block.
96     *
97     * <p>Additionally, "prev" is moved from anywhere in the Unihan
98     * area into the middle of that area.
99     * Note that the identical-level run in a sort key is generated from
100     * NFD text - there are never Hangul characters included.
101     */
102    public static int writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink) {
103        while (i < length) {
104            // We must have capacity>=SLOPE_MAX_BYTES in case writeDiff() writes that much,
105            // but we do not want to force the sink to allocate
106            // for a large min_capacity because we might actually only write one byte.
107            ensureAppendCapacity(sink, 16, s.length() * 2);
108            byte[] buffer = sink.bytes;
109            int capacity = buffer.length;
110            int p = sink.size;
111            int lastSafe = capacity - SLOPE_MAX_BYTES_;
112            while (i < length && p <= lastSafe) {
113                if (prev < 0x4e00 || prev >= 0xa000) {
114                    prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
115                } else {
116                    // Unihan U+4e00..U+9fa5:
117                    // double-bytes down from the upper end
118                    prev = 0x9fff - SLOPE_REACH_POS_2_;
119                }
120
121                int c = Character.codePointAt(s, i);
122                i += Character.charCount(c);
123                if (c == 0xfffe) {
124                    buffer[p++] = 2;  // merge separator
125                    prev = 0;
126                } else {
127                    p = writeDiff(c - prev, buffer, p);
128                    prev = c;
129                }
130            }
131            sink.size = p;
132        }
133        return prev;
134    }
135
136    private static void ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity) {
137        int remainingCapacity = sink.bytes.length - sink.size;
138        if (remainingCapacity >= minCapacity) { return; }
139        if (desiredCapacity < minCapacity) { desiredCapacity = minCapacity; }
140        sink.ensureCapacity(sink.size + desiredCapacity);
141    }
142
143    // private data members --------------------------------------------------
144
145    /**
146     * Do not use byte values 0, 1, 2 because they are separators in sort keys.
147     */
148    private static final int SLOPE_MIN_ = 3;
149    private static final int SLOPE_MAX_ = 0xff;
150    private static final int SLOPE_MIDDLE_ = 0x81;
151    private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
152    private static final int SLOPE_MAX_BYTES_ = 4;
153
154    /**
155     * Number of lead bytes:
156     * 1        middle byte for 0
157     * 2*80=160 single bytes for !=0
158     * 2*42=84  for double-byte values
159     * 2*3=6    for 3-byte values
160     * 2*1=2    for 4-byte values
161     *
162     * The sum must be <=SLOPE_TAIL_COUNT.
163     *
164     * Why these numbers?
165     * - There should be >=128 single-byte values to cover 128-blocks
166     *   with small scripts.
167     * - There should be >=20902 single/double-byte values to cover Unihan.
168     * - It helps CJK Extension B some if there are 3-byte values that cover
169     *   the distance between them and Unihan.
170     *   This also helps to jump among distant places in the BMP.
171     * - Four-byte values are necessary to cover the rest of Unicode.
172     *
173     * Symmetrical lead byte counts are for convenience.
174     * With an equal distribution of even and odd differences there is also
175     * no advantage to asymmetrical lead byte counts.
176     */
177    private static final int SLOPE_SINGLE_ = 80;
178    private static final int SLOPE_LEAD_2_ = 42;
179    private static final int SLOPE_LEAD_3_ = 3;
180    //private static final int SLOPE_LEAD_4_ = 1;
181
182    /**
183     * The difference value range for single-byters.
184     */
185    private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
186    private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);
187
188    /**
189     * The difference value range for double-byters.
190     */
191    private static final int SLOPE_REACH_POS_2_ =
192        SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
193    private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);
194
195    /**
196     * The difference value range for 3-byters.
197     */
198    private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_
199        * SLOPE_TAIL_COUNT_
200        * SLOPE_TAIL_COUNT_
201        + (SLOPE_LEAD_3_ - 1)
202        * SLOPE_TAIL_COUNT_ +
203        (SLOPE_TAIL_COUNT_ - 1);
204    private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);
205
206    /**
207     * The lead byte start values.
208     */
209    private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_
210        + SLOPE_SINGLE_ + 1;
211    private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_
212        + SLOPE_LEAD_2_;
213    private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ +
214        SLOPE_REACH_NEG_1_;
215    private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
216        - SLOPE_LEAD_2_;
217
218    // private constructor ---------------------------------------------------
219
220    /**
221     * Constructor private to prevent initialization
222     */
223    ///CLOVER:OFF
224    private BOCSU()
225    {
226    }
227    ///CLOVER:ON
228
229    // private methods -------------------------------------------------------
230
231    /**
232     * Integer division and modulo with negative numerators
233     * yields negative modulo results and quotients that are one more than
234     * what we need here.
235     * @param number which operations are to be performed on
236     * @param factor the factor to use for division
237     * @return (result of division) << 32 | modulo
238     */
239    private static final long getNegDivMod(int number, int factor)
240    {
241        int modulo = number % factor;
242        long result = number / factor;
243        if (modulo < 0) {
244            -- result;
245            modulo += factor;
246        }
247        return (result << 32) | modulo;
248    }
249
250    /**
251     * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
252     * preserving lexical order
253     * @param diff
254     * @param buffer byte buffer to append to
255     * @param offset to the byte buffer to start appending
256     * @return end offset where the appending stops
257     */
258    private static final int writeDiff(int diff, byte buffer[], int offset)
259    {
260        if (diff >= SLOPE_REACH_NEG_1_) {
261            if (diff <= SLOPE_REACH_POS_1_) {
262                buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);
263            }
264            else if (diff <= SLOPE_REACH_POS_2_) {
265                buffer[offset ++] = (byte)(SLOPE_START_POS_2_
266                                           + (diff / SLOPE_TAIL_COUNT_));
267                buffer[offset ++] = (byte)(SLOPE_MIN_ +
268                                           (diff % SLOPE_TAIL_COUNT_));
269            }
270            else if (diff <= SLOPE_REACH_POS_3_) {
271                buffer[offset + 2] = (byte)(SLOPE_MIN_
272                                            + (diff % SLOPE_TAIL_COUNT_));
273                diff /= SLOPE_TAIL_COUNT_;
274                buffer[offset + 1] = (byte)(SLOPE_MIN_
275                                            + (diff % SLOPE_TAIL_COUNT_));
276                buffer[offset] = (byte)(SLOPE_START_POS_3_
277                                        + (diff / SLOPE_TAIL_COUNT_));
278                offset += 3;
279            }
280            else {
281                buffer[offset + 3] = (byte)(SLOPE_MIN_
282                                            + diff % SLOPE_TAIL_COUNT_);
283                diff /= SLOPE_TAIL_COUNT_;
284                buffer[offset + 2] = (byte)(SLOPE_MIN_
285                                        + diff % SLOPE_TAIL_COUNT_);
286                diff /= SLOPE_TAIL_COUNT_;
287                buffer[offset + 1] = (byte)(SLOPE_MIN_
288                                            + diff % SLOPE_TAIL_COUNT_);
289                buffer[offset] = (byte)SLOPE_MAX_;
290                offset += 4;
291            }
292        }
293        else {
294            long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
295            int modulo = (int)division;
296            if (diff >= SLOPE_REACH_NEG_2_) {
297                diff = (int)(division >> 32);
298                buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);
299                buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);
300            }
301            else if (diff >= SLOPE_REACH_NEG_3_) {
302                buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
303                diff = (int)(division >> 32);
304                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
305                modulo = (int)division;
306                diff = (int)(division >> 32);
307                buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
308                buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);
309                offset += 3;
310            }
311            else {
312                buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);
313                diff = (int)(division >> 32);
314                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
315                modulo = (int)division;
316                diff = (int)(division >> 32);
317                buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
318                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
319                modulo = (int)division;
320                buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
321                buffer[offset] = SLOPE_MIN_;
322                offset += 4;
323            }
324        }
325        return offset;
326    }
327}
328