1/* 2 * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17#include <stdio.h> 18#include <stdlib.h> 19#include "ucdn.h" 20 21typedef struct { 22 unsigned char category; 23 unsigned char combining; 24 unsigned char bidi_class; 25 unsigned char mirrored; 26 unsigned char east_asian_width; 27 unsigned char normalization_check; 28 unsigned char script; 29} UCDRecord; 30 31typedef struct { 32 unsigned short from, to; 33} MirrorPair; 34 35typedef struct { 36 unsigned int start; 37 short count, index; 38} Reindex; 39 40#include "unicodedata_db.h" 41 42/* constants required for Hangul (de)composition */ 43#define SBASE 0xAC00 44#define LBASE 0x1100 45#define VBASE 0x1161 46#define TBASE 0x11A7 47#define SCOUNT 11172 48#define LCOUNT 19 49#define VCOUNT 21 50#define TCOUNT 28 51#define NCOUNT (VCOUNT * TCOUNT) 52 53static const UCDRecord *get_ucd_record(uint32_t code) 54{ 55 int index, offset; 56 57 if (code >= 0x110000) 58 index = 0; 59 else { 60 index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; 61 offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); 62 index = index1[index + offset] << SHIFT2; 63 offset = code & ((1<<SHIFT2) - 1); 64 index = index2[index + offset]; 65 } 66 67 return &ucd_records[index]; 68} 69 70static const unsigned short *get_decomp_record(uint32_t code) 71{ 72 int index, offset; 73 74 if (code >= 0x110000) 75 index = 0; 76 else { 77 index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] 78 << DECOMP_SHIFT1; 79 offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); 80 index = decomp_index1[index + offset] << DECOMP_SHIFT2; 81 offset = code & ((1<<DECOMP_SHIFT2) - 1); 82 index = decomp_index2[index + offset]; 83 } 84 85 return &decomp_data[index]; 86} 87 88static int get_comp_index(uint32_t code, const Reindex *idx) 89{ 90 int i; 91 92 for (i = 0; idx[i].start; i++) { 93 const Reindex *cur = &idx[i]; 94 if (code < cur->start) 95 return -1; 96 if (code <= cur->start + cur->count) { 97 return cur->index + (code - cur->start); 98 } 99 } 100 101 return -1; 102} 103 104static int compare_mp(const void *a, const void *b) 105{ 106 MirrorPair *mpa = (MirrorPair *)a; 107 MirrorPair *mpb = (MirrorPair *)b; 108 return mpa->from - mpb->from; 109} 110 111static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) 112{ 113 int si = code - SBASE; 114 115 if (si < 0 || si >= SCOUNT) 116 return 0; 117 118 if (si % TCOUNT) { 119 /* LV,T */ 120 *a = SBASE + (si / TCOUNT) * TCOUNT; 121 *b = TBASE + (si % TCOUNT); 122 return 3; 123 } else { 124 /* L,V */ 125 *a = LBASE + (si / NCOUNT); 126 *b = VBASE + (si % NCOUNT) / TCOUNT; 127 return 2; 128 } 129} 130 131static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) 132{ 133 if (b < VBASE || b >= (TBASE + TCOUNT)) 134 return 0; 135 136 if ((a < LBASE || a >= (LBASE + LCOUNT)) 137 && (a < SBASE || a >= (SBASE + SCOUNT))) 138 return 0; 139 140 if (a >= SBASE) { 141 /* LV,T */ 142 *code = a + (b - TBASE); 143 return 3; 144 } else { 145 /* L,V */ 146 int li = a - LBASE; 147 int vi = b - VBASE; 148 *code = SBASE + li * NCOUNT + vi * TCOUNT; 149 return 2; 150 } 151} 152 153static uint32_t decode_utf16(const unsigned short **code_ptr) 154{ 155 const unsigned short *code = *code_ptr; 156 157 if ((code[0] & 0xd800) != 0xd800) { 158 *code_ptr += 1; 159 return (uint32_t)code[0]; 160 } else { 161 *code_ptr += 2; 162 return 0x10000 + ((uint32_t)code[1] - 0xdc00) + 163 (((uint32_t)code[0] - 0xd800) << 10); 164 } 165} 166 167const char *ucdn_get_unicode_version(void) 168{ 169 return UNIDATA_VERSION; 170} 171 172int ucdn_get_combining_class(uint32_t code) 173{ 174 return get_ucd_record(code)->combining; 175} 176 177int ucdn_get_east_asian_width(uint32_t code) 178{ 179 return get_ucd_record(code)->east_asian_width; 180} 181 182int ucdn_get_general_category(uint32_t code) 183{ 184 return get_ucd_record(code)->category; 185} 186 187int ucdn_get_bidi_class(uint32_t code) 188{ 189 return get_ucd_record(code)->bidi_class; 190} 191 192int ucdn_get_mirrored(uint32_t code) 193{ 194 return get_ucd_record(code)->mirrored; 195} 196 197int ucdn_get_script(uint32_t code) 198{ 199 return get_ucd_record(code)->script; 200} 201 202uint32_t ucdn_mirror(uint32_t code) 203{ 204 MirrorPair mp = {0}; 205 MirrorPair *res; 206 207 if (get_ucd_record(code)->mirrored == 0) 208 return code; 209 210 mp.from = code; 211 res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair), 212 compare_mp); 213 214 if (res == NULL) 215 return code; 216 else 217 return res->to; 218} 219 220int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b) 221{ 222 const unsigned short *rec; 223 int len; 224 225 if (hangul_pair_decompose(code, a, b)) 226 return 1; 227 228 rec = get_decomp_record(code); 229 len = rec[0] >> 8; 230 231 if ((rec[0] & 0xff) != 0 || len == 0) 232 return 0; 233 234 rec++; 235 *a = decode_utf16(&rec); 236 if (len > 1) 237 *b = decode_utf16(&rec); 238 else 239 *b = 0; 240 241 return 1; 242} 243 244int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) 245{ 246 int l, r, index, indexi, offset; 247 248 if (hangul_pair_compose(code, a, b)) 249 return 1; 250 251 l = get_comp_index(a, nfc_first); 252 r = get_comp_index(b, nfc_last); 253 254 if (l < 0 || r < 0) 255 return 0; 256 257 indexi = l * TOTAL_LAST + r; 258 index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; 259 offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); 260 index = comp_index1[index + offset] << COMP_SHIFT2; 261 offset = indexi & ((1<<COMP_SHIFT2) - 1); 262 *code = comp_data[index + offset]; 263 264 return *code != 0; 265} 266 267int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed) 268{ 269 int i, len; 270 const unsigned short *rec = get_decomp_record(code); 271 len = rec[0] >> 8; 272 273 if (len == 0) 274 return 0; 275 276 rec++; 277 for (i = 0; i < len; i++) 278 decomposed[i] = decode_utf16(&rec); 279 280 return len; 281} 282