PhoneticStringUtils.cpp revision cc6719f08251a892e435f8d9d44e9d8fa18d7cbe
1/* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <stdio.h> 18#include <stdlib.h> 19 20#include "PhoneticStringUtils.h" 21#include <utils/String8.h> 22 23// We'd like 0 length string last of sorted list. So when input string is NULL 24// or 0 length string, we use these instead. 25#define CODEPOINT_FOR_NULL_STR 0xFFFD 26#define STR_FOR_NULL_STR "\xEF\xBF\xBD" 27 28// We assume that users will not notice strings not sorted properly when the 29// first 128 characters are the same. 30#define MAX_CODEPOINTS 128 31 32namespace android { 33 34// Get hiragana from halfwidth katakana. 35static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint, 36 char32_t next_codepoint, 37 bool *next_is_consumed) { 38 if (codepoint < 0xFF66 || 0xFF9F < codepoint) { 39 return codepoint; 40 } 41 42 switch (codepoint) { 43 case 0xFF66: // wo 44 return 0x3092; 45 case 0xFF67: // xa 46 return 0x3041; 47 case 0xFF68: // xi 48 return 0x3043; 49 case 0xFF69: // xu 50 return 0x3045; 51 case 0xFF6A: // xe 52 return 0x3047; 53 case 0xFF6B: // xo 54 return 0x3049; 55 case 0xFF6C: // xya 56 return 0x3083; 57 case 0xFF6D: // xyu 58 return 0x3085; 59 case 0xFF6E: // xyo 60 return 0x3087; 61 case 0xFF6F: // xtsu 62 return 0x3063; 63 case 0xFF70: // - 64 return 0x30FC; 65 case 0xFF9C: // wa 66 return 0x308F; 67 case 0xFF9D: // n 68 return 0x3093; 69 break; 70 default: { 71 if (0xFF71 <= codepoint && codepoint <= 0xFF75) { 72 // a, i, u, e, o 73 if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) { 74 if (next_is_consumed != NULL) { 75 *next_is_consumed = true; 76 } 77 return 0x3094; // vu 78 } else { 79 return 0x3042 + (codepoint - 0xFF71) * 2; 80 } 81 } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) { 82 // ka - chi 83 if (next_codepoint == 0xFF9E) { 84 // "dakuten" (voiced mark) 85 if (next_is_consumed != NULL) { 86 *next_is_consumed = true; 87 } 88 return 0x304B + (codepoint - 0xFF76) * 2 + 1; 89 } else { 90 return 0x304B + (codepoint - 0xFF76) * 2; 91 } 92 } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) { 93 // tsu, te, to (skip xtsu) 94 if (next_codepoint == 0xFF9E) { 95 // "dakuten" (voiced mark) 96 if (next_is_consumed != NULL) { 97 *next_is_consumed = true; 98 } 99 return 0x3064 + (codepoint - 0xFF82) * 2 + 1; 100 } else { 101 return 0x3064 + (codepoint - 0xFF82) * 2; 102 } 103 } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) { 104 // na, ni, nu, ne, no 105 return 0x306A + (codepoint - 0xFF85); 106 } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) { 107 // ha, hi, hu, he, ho 108 if (next_codepoint == 0xFF9E) { 109 // "dakuten" (voiced mark) 110 if (next_is_consumed != NULL) { 111 *next_is_consumed = true; 112 } 113 return 0x306F + (codepoint - 0xFF8A) * 3 + 1; 114 } else if (next_codepoint == 0xFF9F) { 115 // "han-dakuten" (half voiced mark) 116 if (next_is_consumed != NULL) { 117 *next_is_consumed = true; 118 } 119 return 0x306F + (codepoint - 0xFF8A) * 3 + 2; 120 } else { 121 return 0x306F + (codepoint - 0xFF8A) * 3; 122 } 123 } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) { 124 // ma, mi, mu, me, mo 125 return 0x307E + (codepoint - 0xFF8F); 126 } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) { 127 // ya, yu, yo 128 return 0x3084 + (codepoint - 0xFF94) * 2; 129 } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) { 130 // ra, ri, ru, re, ro 131 return 0x3089 + (codepoint - 0xFF97); 132 } 133 // Note: 0xFF9C, 0xFF9D are handled above 134 } // end of default 135 } 136 137 return codepoint; 138} 139 140// Assuming input is hiragana, convert the hiragana to "normalized" hiragana. 141static int GetNormalizedHiragana(int codepoint) { 142 if (codepoint < 0x3040 || 0x309F < codepoint) { 143 return codepoint; 144 } 145 146 // TODO: should care (semi-)voiced mark (0x3099, 0x309A). 147 148 // Trivial kana conversions. 149 // e.g. xa => a 150 switch (codepoint) { 151 case 0x3041: 152 case 0x3043: 153 case 0x3045: 154 case 0x3047: 155 case 0x3049: 156 case 0x308E: // xwa 157 return codepoint + 1; 158 case 0x3095: // xka 159 return 0x304B; 160 case 0x3096: // xku 161 return 0x304F; 162 default: 163 return codepoint; 164 } 165} 166 167static int GetNormalizedKana(char32_t codepoint, 168 char32_t next_codepoint, 169 bool *next_is_consumed) { 170 // First, convert fullwidth katakana and halfwidth katakana to hiragana. 171 if (0x30A1 <= codepoint && codepoint <= 0x30F6) { 172 // Make fullwidth katakana same as hiragana. 173 // 96 == 0x30A1 - 0x3041c 174 codepoint = codepoint - 96; 175 } else { 176 codepoint = GetHiraganaFromHalfwidthKatakana( 177 codepoint, next_codepoint, next_is_consumed); 178 } 179 180 // Normalize Hiragana. 181 return GetNormalizedHiragana(codepoint); 182} 183 184int GetPhoneticallySortableCodePoint(char32_t codepoint, 185 char32_t next_codepoint, 186 bool *next_is_consumed) { 187 if (next_is_consumed != NULL) { 188 *next_is_consumed = false; 189 } 190 191 if (codepoint <= 0x0020 || codepoint == 0x3000) { 192 // Whitespace should be ignored. 193 // Note: Formally, more "whitespace" exist. This block only 194 // handles part of them 195 return -1; 196 } else if ((0x0021 <= codepoint && codepoint <= 0x007E) || 197 (0xFF01 <= codepoint && codepoint <= 0xFF5E)) { 198 // Ascii and fullwidth ascii 199 200 if (0x0021 <= codepoint && codepoint <= 0x007E) { 201 // Convert ascii to fullwidth ascii so that they become 202 // behind hiragana. 203 // 65248 = 0xFF01 - 0x0021 204 codepoint += 65248; 205 } 206 207 // Now, there is only fullwidth ascii. 208 if (0xFF10 <= codepoint && codepoint <= 0xFF19) { 209 // Numbers should be after alphabets but before symbols. 210 // 86 = 0xFF66 211 // (the beginning of halfwidth-katakankana space) - 0xFF10 212 return codepoint + 86; 213 } else if (0xFF41 <= codepoint && codepoint <= 0xFF5A) { 214 // Make lower alphabets same as capital alphabets. 215 // 32 = 0xFF41 - 0xFF21 216 return codepoint - 32; 217 } else if (0xFF01 <= codepoint && codepoint <= 0xFF0F) { 218 // Symbols (Ascii except alphabet nor number) 219 // These should be at the end of sorting, just after numebers 220 // (see below) 221 // 222 // We use halfwidth-katakana space for storing those symbols. 223 // 111 = 0xFF70 (0xFF19 + 86 + 1) - 0xFF01 224 return codepoint + 111; 225 } else if (0xFF1A <= codepoint && codepoint <= 0xFF20) { 226 // Symbols (cont.) 227 // 101 = 0xFF7F (0xFF0F + 111 + 1) - 0xFF1A 228 return codepoint + 101; 229 } else if (0xFF3B <= codepoint && codepoint <= 0xFF40) { 230 // Symbols (cont.) 231 // 75 = 0xFF86 (0xFF20 + 101 + 1) - 0xFF3B (= 101 - 26) 232 return codepoint + 75; 233 } else if (0xFF5B <= codepoint && codepoint <= 0xFF5E) { 234 // Symbols (cont.) 235 // 49 = 0xFF8C (0xFF40 + 75 + 1) - 0xFF5B (= 75 - 26) 236 return codepoint + 49; 237 } else { 238 return codepoint; 239 } 240 } else if (codepoint == 0x02DC || codepoint == 0x223C) { 241 // tilde 242 return 0xFF5E; 243 } else if (codepoint <= 0x3040 || 244 (0x3100 <= codepoint && codepoint < 0xFF00) || 245 codepoint == CODEPOINT_FOR_NULL_STR) { 246 // Move Kanji and other non-Japanese characters behind symbols. 247 return codepoint + 0x10000; 248 } 249 250 // Below is Kana-related handling. 251 252 return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed); 253} 254 255int GetNormalizedCodePoint(char32_t codepoint, 256 char32_t next_codepoint, 257 bool *next_is_consumed) { 258 if (next_is_consumed != NULL) { 259 *next_is_consumed = false; 260 } 261 262 if (codepoint <= 0x0020 || codepoint == 0x3000) { 263 // Whitespaces. Keep it as is. 264 return codepoint; 265 } else if ((0x0021 <= codepoint && codepoint <= 0x007E) || 266 (0xFF01 <= codepoint && codepoint <= 0xFF5E)) { 267 // Ascii and fullwidth ascii. Keep it as is 268 return codepoint; 269 } else if (codepoint == 0x02DC || codepoint == 0x223C) { 270 // tilde 271 return 0xFF5E; 272 } else if (codepoint <= 0x3040 || 273 (0x3100 <= codepoint && codepoint < 0xFF00) || 274 codepoint == CODEPOINT_FOR_NULL_STR) { 275 // Keep it as is. 276 return codepoint; 277 } 278 279 // Below is Kana-related handling. 280 281 return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed); 282} 283 284static bool GetExpectedString( 285 const char *src, char **dst, size_t *dst_len, 286 int (*get_codepoint_function)(char32_t, char32_t, bool*)) { 287 if (dst == NULL || dst_len == NULL) { 288 return false; 289 } 290 291 if (src == NULL || *src == '\0') { 292 src = STR_FOR_NULL_STR; 293 } 294 295 char32_t codepoints[MAX_CODEPOINTS]; // if array size is changed the for loop needs to be changed 296 297 size_t src_len = utf8_length(src); 298 if (src_len == 0) { 299 return false; 300 } 301 bool next_is_consumed; 302 size_t j = 0; 303 for (size_t i = 0; i < src_len && j < MAX_CODEPOINTS;) { 304 int32_t ret = utf32_at(src, src_len, i, &i); 305 if (ret < 0) { 306 // failed to parse UTF-8 307 return false; 308 } 309 ret = get_codepoint_function( 310 static_cast<char32_t>(ret), 311 i + 1 < src_len ? src[i + 1] : 0, 312 &next_is_consumed); 313 if (ret > 0) { 314 codepoints[j] = static_cast<char32_t>(ret); 315 j++; 316 } 317 if (next_is_consumed) { 318 i++; 319 } 320 } 321 size_t length = j; 322 323 if (length == 0) { 324 // If all of codepoints are invalid, we place the string at the end of 325 // the list. 326 codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR; 327 length = 1; 328 } 329 330 size_t new_len = utf8_length_from_utf32(codepoints, length); 331 *dst = static_cast<char *>(malloc(new_len + 1)); 332 if (*dst == NULL) { 333 return false; 334 } 335 336 if (utf32_to_utf8(codepoints, length, *dst, new_len + 1) != new_len) { 337 free(*dst); 338 *dst = NULL; 339 return false; 340 } 341 342 *dst_len = new_len; 343 return true; 344} 345 346bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len) { 347 return GetExpectedString(src, dst, len, GetPhoneticallySortableCodePoint); 348} 349 350bool GetNormalizedString(const char *src, char **dst, size_t *len) { 351 return GetExpectedString(src, dst, len, GetNormalizedCodePoint); 352} 353 354} // namespace android 355