PhonebookIndex.cpp revision 3a74962298118ee138e290c3458bccb895854b47
1/*
2 * Copyright 2010, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <ctype.h>
18#include <string.h>
19
20#include <unicode/ucol.h>
21#include <unicode/uiter.h>
22#include <unicode/ustring.h>
23#include <unicode/utypes.h>
24
25#include "PhonebookIndex.h"
26#include "PhoneticStringUtils.h"
27
28#define SMALL_BUFFER_SIZE 10
29
30namespace android {
31
32// IMPORTANT!  Keep the codes below SORTED. We are doing a binary search on the array
33static UChar DEFAULT_CHAR_MAP[] = {
34    0x00C6,    'A',       // AE
35    0x00DF,    'S',       // Etzett
36    0x1100, 0x3131,       // HANGUL LETTER KIYEOK
37    0x1101, 0x3132,       // HANGUL LETTER SSANGKIYEOK
38    0x1102, 0x3134,       // HANGUL LETTER NIEUN
39    0x1103, 0x3137,       // HANGUL LETTER TIKEUT
40    0x1104, 0x3138,       // HANGUL LETTER SSANGTIKEUT
41    0x1105, 0x3139,       // HANGUL LETTER RIEUL
42    0x1106, 0x3141,       // HANGUL LETTER MIEUM
43    0x1107, 0x3142,       // HANGUL LETTER PIEUP
44    0x1108, 0x3143,       // HANGUL LETTER SSANGPIEUP
45    0x1109, 0x3145,       // HANGUL LETTER SIOS
46    0x110A, 0x3146,       // HANGUL LETTER SSANGSIOS
47    0x110B, 0x3147,       // HANGUL LETTER IEUNG
48    0x110C, 0x3148,       // HANGUL LETTER CIEUC
49    0x110D, 0x3149,       // HANGUL LETTER SSANGCIEUC
50    0x110E, 0x314A,       // HANGUL LETTER CHIEUCH
51    0x110F, 0x314B,       // HANGUL LETTER KHIEUKH
52    0x1110, 0x314C,       // HANGUL LETTER THIEUTH
53    0x1111, 0x314D,       // HANGUL LETTER PHIEUPH
54    0x1112, 0x314E,       // HANGUL LETTER HIEUH
55    0x111A, 0x3140,       // HANGUL LETTER RIEUL-HIEUH
56    0x1121, 0x3144,       // HANGUL LETTER PIEUP-SIOS
57    0x1161, 0x314F,       // HANGUL LETTER A
58    0x1162, 0x3150,       // HANGUL LETTER AE
59    0x1163, 0x3151,       // HANGUL LETTER YA
60    0x1164, 0x3152,       // HANGUL LETTER YAE
61    0x1165, 0x3153,       // HANGUL LETTER EO
62    0x1166, 0x3154,       // HANGUL LETTER E
63    0x1167, 0x3155,       // HANGUL LETTER YEO
64    0x1168, 0x3156,       // HANGUL LETTER YE
65    0x1169, 0x3157,       // HANGUL LETTER O
66    0x116A, 0x3158,       // HANGUL LETTER WA
67    0x116B, 0x3159,       // HANGUL LETTER WAE
68    0x116C, 0x315A,       // HANGUL LETTER OE
69    0x116D, 0x315B,       // HANGUL LETTER YO
70    0x116E, 0x315C,       // HANGUL LETTER U
71    0x116F, 0x315D,       // HANGUL LETTER WEO
72    0x1170, 0x315E,       // HANGUL LETTER WE
73    0x1171, 0x315F,       // HANGUL LETTER WI
74    0x1172, 0x3160,       // HANGUL LETTER YU
75    0x1173, 0x3161,       // HANGUL LETTER EU
76    0x1174, 0x3162,       // HANGUL LETTER YI
77    0x1175, 0x3163,       // HANGUL LETTER I
78    0x11AA, 0x3133,       // HANGUL LETTER KIYEOK-SIOS
79    0x11AC, 0x3135,       // HANGUL LETTER NIEUN-CIEUC
80    0x11AD, 0x3136,       // HANGUL LETTER NIEUN-HIEUH
81    0x11B0, 0x313A,       // HANGUL LETTER RIEUL-KIYEOK
82    0x11B1, 0x313B,       // HANGUL LETTER RIEUL-MIEUM
83    0x11B3, 0x313D,       // HANGUL LETTER RIEUL-SIOS
84    0x11B4, 0x313E,       // HANGUL LETTER RIEUL-THIEUTH
85    0x11B5, 0x313F,       // HANGUL LETTER RIEUL-PHIEUPH
86};
87
88/**
89 * Binary search to map an individual character to the corresponding phone book index.
90 */
91static UChar map_character(UChar c, UChar * char_map, int32_t length) {
92  int from = 0, to = length;
93  while (from < to) {
94    int m = ((to + from) >> 1) & ~0x1;    // Only consider even positions
95    UChar cm = char_map[m];
96    if (cm == c) {
97      return char_map[m + 1];
98    } else if (cm < c) {
99      from = m + 2;
100    } else {
101      to = m;
102    }
103  }
104  return 0;
105}
106
107/**
108 * Returns TRUE if the character belongs to a Hanzi unicode block
109 */
110static bool is_CJK(UChar c) {
111  return
112       (0x4e00 <= c && c <= 0x9fff)     // CJK_UNIFIED_IDEOGRAPHS
113    || (0x3400 <= c && c <= 0x4dbf)     // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
114    || (0x3000 <= c && c <= 0x303f)     // CJK_SYMBOLS_AND_PUNCTUATION
115    || (0x2e80 <= c && c <= 0x2eff)     // CJK_RADICALS_SUPPLEMENT
116    || (0x3300 <= c && c <= 0x33ff)     // CJK_COMPATIBILITY
117    || (0xfe30 <= c && c <= 0xfe4f)     // CJK_COMPATIBILITY_FORMS
118    || (0xf900 <= c && c <= 0xfaff);    // CJK_COMPATIBILITY_IDEOGRAPHS
119}
120
121UChar GetPhonebookIndex(UCharIterator * iter, const char * locale) {
122    UChar dest[SMALL_BUFFER_SIZE];
123
124    // Normalize the first character to remove accents using the NFD normalization
125    UErrorCode errorCode = U_ZERO_ERROR;
126    int32_t len = unorm_next(iter, dest, SMALL_BUFFER_SIZE * sizeof(UChar), UNORM_NFD,
127            0 /* options */, TRUE /* normalize */, NULL, &errorCode);
128    if (U_FAILURE(errorCode) || len == 0) {
129      return 0;
130    }
131
132    UChar c = dest[0];
133
134    // We are only interested in letters
135    if (!u_isalpha(c)) {
136      return 0;
137    }
138
139    c = u_toupper(c);
140
141    // Check for explicitly mapped characters
142    UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
143    if (c_mapped != 0) {
144      return c_mapped;
145    }
146
147    // Convert Kanas to Hiragana
148    UChar next = len > 2 ? dest[1] : 0;
149    c = android::GetNormalizedCodePoint(c, next, NULL);
150
151    if (is_CJK(c)) {
152      if (strncmp(locale, "ja", 2) == 0) {
153        return 0x8A18;  // Kanji character used as a heading in letters, notices and other documents
154      } else {
155        return 0;
156      }
157    }
158
159    return c;
160}
161
162}  // namespace android
163