1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_BYTE_ARRAY_UTILS_H
18#define LATINIME_BYTE_ARRAY_UTILS_H
19
20#include <cstdint>
21
22#include "defines.h"
23
24namespace latinime {
25
26/**
27 * Utility methods for reading byte arrays.
28 */
29class ByteArrayUtils {
30 public:
31    /**
32     * Integer writing
33     *
34     * Each method write a corresponding size integer in a big endian manner.
35     */
36    static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer,
37            const uint32_t data, const int size, int *const pos) {
38        // size must be in 1 to 4.
39        ASSERT(size >= 1 && size <= 4);
40        switch (size) {
41            case 1:
42                ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos);
43                return;
44            case 2:
45                ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos);
46                return;
47            case 3:
48                ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos);
49                return;
50            case 4:
51                ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos);
52                return;
53            default:
54                break;
55        }
56    }
57
58    /**
59     * Integer reading
60     *
61     * Each method read a corresponding size integer in a big endian manner.
62     */
63    static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
64        return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16)
65                ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3];
66    }
67
68    static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
69        return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2];
70    }
71
72    static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
73        return (buffer[pos] << 8) ^ buffer[pos + 1];
74    }
75
76    static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
77        return buffer[pos];
78    }
79
80    static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition(
81            const uint8_t *const buffer, int *const pos) {
82        const uint32_t value = readUint32(buffer, *pos);
83        *pos += 4;
84        return value;
85    }
86
87    static AK_FORCE_INLINE int readSint24AndAdvancePosition(
88            const uint8_t *const buffer, int *const pos) {
89        const uint8_t value = readUint8(buffer, *pos);
90        if (value < 0x80) {
91            return readUint24AndAdvancePosition(buffer, pos);
92        } else {
93            (*pos)++;
94            return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos));
95        }
96    }
97
98    static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition(
99            const uint8_t *const buffer, int *const pos) {
100        const uint32_t value = readUint24(buffer, *pos);
101        *pos += 3;
102        return value;
103    }
104
105    static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition(
106            const uint8_t *const buffer, int *const pos) {
107        const uint16_t value = readUint16(buffer, *pos);
108        *pos += 2;
109        return value;
110    }
111
112    static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition(
113            const uint8_t *const buffer, int *const pos) {
114        return buffer[(*pos)++];
115    }
116
117    static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer,
118            const int size, const int pos) {
119        // size must be in 1 to 4.
120        ASSERT(size >= 1 && size <= 4);
121        switch (size) {
122            case 1:
123                return ByteArrayUtils::readUint8(buffer, pos);
124            case 2:
125                return ByteArrayUtils::readUint16(buffer, pos);
126            case 3:
127                return ByteArrayUtils::readUint24(buffer, pos);
128            case 4:
129                return ByteArrayUtils::readUint32(buffer, pos);
130            default:
131                return 0;
132        }
133    }
134
135    /**
136     * Code Point Reading
137     *
138     * 1 byte = bbbbbbbb match
139     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
140     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
141     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
142     *       00011111 would be outside unicode.
143     * else: iso-latin-1 code
144     * This allows for the whole unicode range to be encoded, including chars outside of
145     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
146     * characters which should never happen anyway (and still work, but take 3 bytes).
147     */
148    static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
149        int p = pos;
150        return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
151    }
152
153    static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
154            const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
155        /*
156         * codePointTable is an array to convert the most frequent characters in this dictionary to
157         * 1 byte code points. It is only made of the original code points of the most frequent
158         * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
159         * The original code points are restored by picking the code points at the indices of the
160         * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
161         */
162        const uint8_t firstByte = readUint8(buffer, *pos);
163        if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
164            if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
165                *pos += 1;
166                return NOT_A_CODE_POINT;
167            } else {
168                return readUint24AndAdvancePosition(buffer, pos);
169            }
170        } else {
171            *pos += 1;
172            if (codePointTable) {
173                return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
174            }
175            return firstByte;
176        }
177    }
178
179    /**
180     * String (array of code points) Reading
181     *
182     * Reads code points until the terminator is found.
183     */
184    // Returns the length of the string.
185    static int readStringAndAdvancePosition(const uint8_t *const buffer,
186            const int maxLength, const int *const codePointTable, int *const outBuffer,
187            int *const pos) {
188        int length = 0;
189        int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
190        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
191            outBuffer[length++] = codePoint;
192            codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
193        }
194        return length;
195    }
196
197    // Advances the position and returns the length of the string.
198    static int advancePositionToBehindString(
199            const uint8_t *const buffer, const int maxLength, int *const pos) {
200        int length = 0;
201        int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
202        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
203            codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
204            length++;
205        }
206        return length;
207    }
208
209    /**
210     * String (array of code points) Writing
211     */
212    static void writeCodePointsAndAdvancePosition(uint8_t *const buffer,
213            const int *const codePoints, const int codePointCount, const bool writesTerminator,
214            int *const pos) {
215        for (int i = 0; i < codePointCount; ++i) {
216            const int codePoint = codePoints[i];
217            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
218                break;
219            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
220                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
221                // three bytes character.
222                writeUint24AndAdvancePosition(buffer, codePoint, pos);
223            } else {
224                // one byte character.
225                writeUint8AndAdvancePosition(buffer, codePoint, pos);
226            }
227        }
228        if (writesTerminator) {
229            writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos);
230        }
231    }
232
233    static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints,
234            const int codePointCount, const bool writesTerminator) {
235        int byteCount = 0;
236        for (int i = 0; i < codePointCount; ++i) {
237            const int codePoint = codePoints[i];
238            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
239                break;
240            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
241                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
242                // three bytes character.
243                byteCount += 3;
244            } else {
245                // one byte character.
246                byteCount += 1;
247            }
248        }
249        if (writesTerminator) {
250            // The terminator is one byte.
251            byteCount += 1;
252        }
253        return byteCount;
254    }
255
256 private:
257    DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
258
259    static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
260    static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
261    static const uint8_t CHARACTER_ARRAY_TERMINATOR;
262
263    static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
264            const uint32_t data, int *const pos) {
265        buffer[(*pos)++] = (data >> 24) & 0xFF;
266        buffer[(*pos)++] = (data >> 16) & 0xFF;
267        buffer[(*pos)++] = (data >> 8) & 0xFF;
268        buffer[(*pos)++] = data & 0xFF;
269    }
270
271    static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer,
272            const uint32_t data, int *const pos) {
273        buffer[(*pos)++] = (data >> 16) & 0xFF;
274        buffer[(*pos)++] = (data >> 8) & 0xFF;
275        buffer[(*pos)++] = data & 0xFF;
276    }
277
278    static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer,
279            const uint16_t data, int *const pos) {
280        buffer[(*pos)++] = (data >> 8) & 0xFF;
281        buffer[(*pos)++] = data & 0xFF;
282    }
283
284    static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer,
285            const uint8_t data, int *const pos) {
286        buffer[(*pos)++] = data & 0xFF;
287    }
288};
289} // namespace latinime
290#endif /* LATINIME_BYTE_ARRAY_UTILS_H */
291