1/*
2 * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17#include <stdio.h>
18#include <stdlib.h>
19#include <stdint.h>
20#include "ucdn.h"
21
/*
 * Property record for a code point. get_ucd_record() returns a pointer to
 * one of these out of ucd_records, and each ucdn_get_*() accessor hands back
 * one field of it.
 *
 * NOTE(review): the table is presumably initialised positionally by the
 * generated unicodedata_db.h, so keep the field order as it is.
 */
typedef struct {
    unsigned char category,             /* ucdn_get_general_category() */
                  combining,            /* ucdn_get_combining_class() */
                  bidi_class,           /* ucdn_get_bidi_class() */
                  mirrored,             /* ucdn_get_mirrored(); gates ucdn_mirror() */
                  east_asian_width,     /* ucdn_get_east_asian_width() */
                  normalization_check,  /* no accessor among the functions below */
                  script;               /* ucdn_get_script() */
} UCDRecord;
31
/*
 * One entry of the mirror_pairs table searched by ucdn_mirror():
 * `from` is the key compare_mp() orders by, `to` is the value returned
 * for a matching key.
 */
typedef struct {
    unsigned short from;
    unsigned short to;
} MirrorPair;
35
/*
 * One range of a composition re-indexing table, as consumed by
 * get_comp_index(): code points in [start, start + count] map to
 * index + (code - start). An entry with start == 0 terminates the table.
 */
typedef struct {
    int start;
    short count;
    short index;
} Reindex;
40
41#include "unicodedata_db.h"
42
/*
 * Constants required for Hangul (de)composition.
 *
 * xBASE is the origin each index is measured from and xCOUNT the number of
 * index values: S = precomposed syllable, L = leading consonant, V = vowel,
 * T = trailing consonant. T index 0 means "no trailing consonant", which is
 * why the first real trailing consonant is TBASE + 1.
 */
#define SBASE 0xAC00
#define LBASE 0x1100
#define VBASE 0x1161
#define TBASE 0x11A7

#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)    /* syllables per leading consonant: 588 */
#define SCOUNT (LCOUNT * NCOUNT)    /* total precomposed syllables: 11172 */
53
54static const UCDRecord *get_ucd_record(uint32_t code)
55{
56    int index, offset;
57
58    if (code >= 0x110000)
59        index = 0;
60    else {
61        index  = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
62        offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
63        index  = index1[index + offset] << SHIFT2;
64        offset = code & ((1<<SHIFT2) - 1);
65        index  = index2[index + offset];
66    }
67
68    return &ucd_records[index];
69}
70
71static const unsigned short *get_decomp_record(uint32_t code)
72{
73    int index, offset;
74
75    if (code >= 0x110000)
76        index = 0;
77    else {
78        index  = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
79            << DECOMP_SHIFT1;
80        offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
81        index  = decomp_index1[index + offset] << DECOMP_SHIFT2;
82        offset = code & ((1<<DECOMP_SHIFT2) - 1);
83        index  = decomp_index2[index + offset];
84    }
85
86    return &decomp_data[index];
87}
88
89static int get_comp_index(uint32_t code, const Reindex *idx)
90{
91    int i;
92
93    for (i = 0; idx[i].start; i++) {
94        const Reindex *cur = &idx[i];
95        if (code < cur->start)
96            return -1;
97        if (code <= cur->start + cur->count) {
98            return cur->index + (code - cur->start);
99        }
100    }
101
102    return -1;
103}
104
105static int compare_mp(const void *a, const void *b)
106{
107    MirrorPair *mpa = (MirrorPair *)a;
108    MirrorPair *mpb = (MirrorPair *)b;
109    return mpa->from - mpb->from;
110}
111
112static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
113{
114    int si = code - SBASE;
115
116    if (si < 0 || si >= SCOUNT)
117        return 0;
118
119    if (si % TCOUNT) {
120        /* LV,T */
121        *a = SBASE + (si / TCOUNT) * TCOUNT;
122        *b = TBASE + (si % TCOUNT);
123        return 3;
124    } else {
125        /* L,V */
126        *a = LBASE + (si / NCOUNT);
127        *b = VBASE + (si % NCOUNT) / TCOUNT;
128        return 2;
129    }
130}
131
132static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
133{
134    if (b < VBASE || b >= (TBASE + TCOUNT))
135        return 0;
136
137    if ((a < LBASE || a >= (LBASE + LCOUNT))
138            && (a < SBASE || a >= (SBASE + SCOUNT)))
139        return 0;
140
141    if (a >= SBASE) {
142        /* LV,T */
143        *code = a + (b - TBASE);
144        return 3;
145    } else {
146        /* L,V */
147        int li = a - LBASE;
148        int vi = b - VBASE;
149        *code = SBASE + li * NCOUNT + vi * TCOUNT;
150        return 2;
151    }
152}
153
/*
 * Decode one code point from a UTF-16 sequence and advance *code_ptr past
 * the code units consumed.
 *
 * A high surrogate (0xD800..0xDBFF) is combined with the unit that follows
 * it and two units are consumed. The following unit is not validated, so
 * the input is trusted to be well-formed. Every other unit, including
 * 0xF800..0xFFFF and an unpaired low surrogate, is returned as is and one
 * unit is consumed.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
    const unsigned short *code = *code_ptr;
    const uint32_t lead = code[0];

    /* masking with 0xfc00 keeps the top six bits, unique to high surrogates */
    if ((lead & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return lead;
    }

    *code_ptr += 2;
    return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
        ((lead - 0xd800) << 10);
}
167
168const char *ucdn_get_unicode_version(void)
169{
170    return UNIDATA_VERSION;
171}
172
173int ucdn_get_combining_class(uint32_t code)
174{
175    return get_ucd_record(code)->combining;
176}
177
178int ucdn_get_east_asian_width(uint32_t code)
179{
180    return get_ucd_record(code)->east_asian_width;
181}
182
183int ucdn_get_general_category(uint32_t code)
184{
185    return get_ucd_record(code)->category;
186}
187
188int ucdn_get_bidi_class(uint32_t code)
189{
190    return get_ucd_record(code)->bidi_class;
191}
192
193int ucdn_get_mirrored(uint32_t code)
194{
195    return get_ucd_record(code)->mirrored;
196}
197
198int ucdn_get_script(uint32_t code)
199{
200    return get_ucd_record(code)->script;
201}
202
203uint32_t ucdn_mirror(uint32_t code)
204{
205    MirrorPair mp = {0};
206    MirrorPair *res;
207
208    if (get_ucd_record(code)->mirrored == 0)
209        return code;
210
211    mp.from = code;
212    res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair),
213            compare_mp);
214
215    if (res == NULL)
216        return code;
217    else
218        return res->to;
219}
220
/*
 * Decompose a code point into a pair.
 *
 * Hangul syllables are handled arithmetically first. Otherwise the
 * decomposition entry is consulted: its first element carries the length
 * in the high byte and a tag in the low byte. Entries with a non-zero tag
 * are rejected here (presumably compatibility mappings, since
 * ucdn_compat_decompose() accepts them -- confirm against the generator).
 *
 * On success returns 1 with *a set to the first code point and *b to the
 * second, or 0 when the decomposition has a single code point.
 * Returns 0 when there is no usable decomposition; *a and *b are then
 * left untouched.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    const unsigned short *entry;
    int count;

    if (hangul_pair_decompose(code, a, b) != 0)
        return 1;

    entry = get_decomp_record(code);
    count = entry[0] >> 8;

    if (count == 0 || (entry[0] & 0xff) != 0)
        return 0;

    /* step over the header element to the UTF-16 payload */
    entry++;
    *a = decode_utf16(&entry);
    *b = (count > 1) ? decode_utf16(&entry) : 0;

    return 1;
}
244
245int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
246{
247    int l, r, index, indexi, offset;
248
249    if (hangul_pair_compose(code, a, b))
250        return 1;
251
252    l = get_comp_index(a, nfc_first);
253    r = get_comp_index(b, nfc_last);
254
255    if (l < 0 || r < 0)
256        return 0;
257
258    indexi = l * TOTAL_LAST + r;
259    index  = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
260    offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
261    index  = comp_index1[index + offset] << COMP_SHIFT2;
262    offset = indexi & ((1<<COMP_SHIFT2) - 1);
263    *code  = comp_data[index + offset];
264
265    return *code != 0;
266}
267
/*
 * Write the full decomposition stored for `code` into `decomposed`.
 *
 * Unlike ucdn_decompose() this ignores the tag in the low byte of the
 * entry header and does not special-case Hangul syllables.
 *
 * Returns the number of code points written, or 0 when the entry is
 * empty (nothing is written then). `decomposed` must have room for the
 * longest entry in the tables.
 * NOTE(review): that maximum is set by the generated data and is not
 * visible here.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    const unsigned short *entry = get_decomp_record(code);
    const int count = entry[0] >> 8;
    int written = 0;

    if (count == 0)
        return 0;

    /* step over the header element to the UTF-16 payload */
    entry++;
    while (written < count)
        decomposed[written++] = decode_utf16(&entry);

    return count;
}
283