PhoneticStringUtilsTest.cpp revision 455ed29fb92a9adf411252df5e74541269d10806
1/* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "PhoneticStringUtils.h" 18 19#include <stdio.h> 20#include <stdlib.h> 21#include <string.h> 22 23using namespace android; 24 25class TestExecutor { 26 public: 27 TestExecutor() : m_total_count(0), m_success_count(0), m_success(true) {} 28 bool DoAllTests(); 29 private: 30 void DoOneTest(void (TestExecutor::*test)()); 31 32 void testGetCodePointFromUtf8(); 33 void testGetPhoneticallySortableCodePointAscii(); 34 void testGetPhoneticallySortableCodePointKana(); 35 void testGetPhoneticallySortableCodePointSimpleCompare(); 36 void testGetUtf8FromCodePoint(); 37 void testGetPhoneticallySortableString(); 38 39 // Note: When adding a test, do not forget to add it to DoOneTest(). 40 41 int m_total_count; 42 int m_success_count; 43 44 bool m_success; 45}; 46 47#define ASSERT_EQ_VALUE(input, expected) \ 48 ({ \ 49 if ((expected) != (input)) { \ 50 printf("0x%X(result) != 0x%X(expected)\n", input, expected); \ 51 m_success = false; \ 52 return; \ 53 } \ 54 }) 55 56#define EXPECT_EQ_VALUE(input, expected) \ 57 ({ \ 58 if ((expected) != (input)) { \ 59 printf("0x%X(result) != 0x%X(expected)\n", input, expected); \ 60 m_success = false; \ 61 } \ 62 }) 63 64 65bool TestExecutor::DoAllTests() { 66 DoOneTest(&TestExecutor::testGetCodePointFromUtf8); 67 DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii); 68 DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana); 69 DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare); 70 DoOneTest(&TestExecutor::testGetUtf8FromCodePoint); 71 DoOneTest(&TestExecutor::testGetPhoneticallySortableString); 72 73 printf("Test total: %d\nSuccess: %d\nFailure: %d\n", 74 m_total_count, m_success_count, m_total_count - m_success_count); 75 76 bool success = m_total_count == m_success_count; 77 printf("\n%s\n", success ? "Success" : "Failure"); 78 79 return success; 80} 81 82void TestExecutor::DoOneTest(void (TestExecutor::*test)()) { 83 m_success = true; 84 85 (this->*test)(); 86 87 ++m_total_count; 88 m_success_count += m_success ? 1 : 0; 89} 90 91void TestExecutor::testGetCodePointFromUtf8() { 92 printf("testGetCodePointFromUtf8()\n"); 93 int next; 94 95 EXPECT_EQ_VALUE(GetCodePointFromUtf8("a", 1, 0, &next), 97); 96 EXPECT_EQ_VALUE(next, 1); 97 // Japanese hiragana "a" 98 EXPECT_EQ_VALUE(GetCodePointFromUtf8("\xE3\x81\x82", 3, 0, &next), 0x3042); 99 EXPECT_EQ_VALUE(next, 3); 100 // Japanese fullwidth katakana "a" with ascii a 101 EXPECT_EQ_VALUE(GetCodePointFromUtf8("a\xE3\x82\xA2", 4, 1, &next), 0x30A2); 102 EXPECT_EQ_VALUE(next, 4); 103 104 // 2 PUA 105 ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 106 8, 0, &next), 0xFE000); 107 ASSERT_EQ_VALUE(next, 4); 108 ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 109 8, next, &next), 0xFE008); 110 ASSERT_EQ_VALUE(next, 8); 111} 112 113void TestExecutor::testGetPhoneticallySortableCodePointAscii() { 114 printf("testGetPhoneticallySortableCodePoint()\n"); 115 int halfwidth[94]; 116 int fullwidth[94]; 117 int i, codepoint; 118 bool next_is_consumed; 119 for (i = 0, codepoint = 0x0021; codepoint <= 0x007E; ++i, ++codepoint) { 120 halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1, 121 &next_is_consumed); 122 if (halfwidth[i] < 0) { 123 printf("returned value become negative at 0x%04X", codepoint); 124 } 125 if (next_is_consumed) { 126 printf("next_is_consumed become true at 0x%04X", codepoint); 127 m_success = false; 128 return; 129 } 130 } 131 for (i = 0, codepoint = 0xFF01; codepoint <= 0xFF5E; ++i, ++codepoint) { 132 fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1, 133 &next_is_consumed); 134 if (fullwidth[i] < 0) { 135 printf("returned value become negative at 0x%04X", codepoint); 136 } 137 if (next_is_consumed) { 138 printf("next_is_consumed become true at 0x%04X", codepoint); 139 m_success = false; 140 return; 141 } 142 } 143 144 for (i = 0; i < 94; i++) { 145 EXPECT_EQ_VALUE(halfwidth[i], fullwidth[i]); 146 } 147} 148 149void TestExecutor::testGetPhoneticallySortableCodePointKana() { 150 printf("testGetPhoneticallySortableCodePointKana()\n"); 151 int hiragana[86]; 152 int fullwidth_katakana[86]; 153 int i, codepoint; 154 bool next_is_consumed; 155 156 for (i = 0, codepoint = 0x3041; codepoint <= 0x3096; ++i, ++codepoint) { 157 hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, -1, 158 &next_is_consumed); 159 if (hiragana[i] < 0) { 160 printf("returned value become negative at 0x%04X", codepoint); 161 } 162 if (next_is_consumed) { 163 printf("next_is_consumed become true at 0x%04X", codepoint); 164 m_success = false; 165 return; 166 } 167 } 168 169 for (i = 0, codepoint = 0x30A1; codepoint <= 0x30F6; ++i, ++codepoint) { 170 fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, -1, 171 &next_is_consumed); 172 if (fullwidth_katakana[i] < 0) { 173 printf("returned value become negative at 0x%04X", codepoint); 174 } 175 if (next_is_consumed) { 176 printf("next_is_consumed become true at 0x%04X", codepoint); 177 m_success = false; 178 return; 179 } 180 } 181 182 // hankaku-katakana space do not have some characters corresponding to 183 // zenkaku-hiragana (e.g. xwa, xka, xku). To make test easier, insert 184 // zenkaku-katakana version of them into this array (See the value 0x30??). 185 int halfwidth_katakana[] = { 186 0xFF67, 0xFF71, 0xFF68, 0xFF72, 0xFF69, 0xFF73, 0xFF6A, 0xFF74, 0xFF6B, 187 0xFF75, 0xFF76, 0xFF76, 0xFF9E, 0xFF77, 0xFF77, 0xFF9E, 0xFF78, 0xFF78, 188 0xFF9E, 0xFF79, 0xFF79, 0xFF9E, 0xFF7A, 0xFF7A, 0xFF9E, 0xFF7B, 0xFF7B, 189 0xFF9E, 0xFF7C, 0xFF7C, 0xFF9E, 0xFF7D, 0xFF7D, 0xFF9E, 0xFF7E, 0xFF7E, 190 0xFF9E, 0xFF7F, 0xFF7F, 0xFF9E, 0xFF80, 0xFF80, 0xFF9E, 0xFF81, 0xFF81, 191 0xFF9E, 0xFF6F, 0xFF82, 0xFF82, 0xFF9E, 0xFF83, 0xFF83, 0xFF9E, 0xFF84, 192 0xFF84, 0xFF9E, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8A, 193 0xFF9E, 0xFF8A, 0xFF9F, 0xFF8B, 0xFF8B, 0xFF9E, 0xFF8B, 0xFF9F, 0xFF8C, 194 0xFF8C, 0xFF9E, 0xFF8C, 0xFF9F, 0xFF8D, 0xFF8D, 0xFF9E, 0xFF8D, 0xFF9F, 195 0xFF8E, 0xFF8E, 0xFF9E, 0xFF8E, 0xFF9F, 0xFF8F, 0xFF90, 0xFF91, 0xFF92, 196 0xFF93, 0xFF6C, 0xFF94, 0xFF6D, 0xFF95, 0xFF6E, 0xFF96, 0xFF97, 0xFF98, 197 0xFF99, 0xFF9A, 0xFF9B, 0x30EE, 0xFF9C, 0x30F0, 0x30F1, 0xFF66, 0xFF9D, 198 0xFF73, 0xFF9E, 0x30F5, 0x30F6}; 199 int len = sizeof(halfwidth_katakana)/sizeof(int); 200 201 int halfwidth_katakana_result[86]; 202 203 int j; 204 for (i = 0, j = 0; i < len && j < 86; ++i, ++j) { 205 int codepoint = halfwidth_katakana[i]; 206 int next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : -1; 207 halfwidth_katakana_result[j] = 208 GetPhoneticallySortableCodePoint(codepoint, next_codepoint, 209 &next_is_consumed); 210 // Consume voiced mark/half-voiced mark. 211 if (next_is_consumed) { 212 ++i; 213 } 214 } 215 ASSERT_EQ_VALUE(i, len); 216 ASSERT_EQ_VALUE(j, 86); 217 218 for (i = 0; i < 86; ++i) { 219 EXPECT_EQ_VALUE(fullwidth_katakana[i], hiragana[i]); 220 EXPECT_EQ_VALUE(halfwidth_katakana_result[i], hiragana[i]); 221 } 222} 223 224void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() { 225 printf("testGetPhoneticallySortableCodePointSimpleCompare()\n"); 226 227 int codepoints[] = { 228 0x3042, 0x30AB, 0xFF7B, 0x305F, 0x30CA, 0xFF8A, 0x30D0, 0x3071, 229 0x307E, 0x30E4, 0xFF97, 0x308F, 0x3093, 0x3094, 'A', 'Z', 230 '0', '9', '!', '/', ':', '?', '[', '`', '{', '~'}; 231 size_t len = sizeof(codepoints)/sizeof(int); 232 bool next_is_consumed; 233 for (size_t i = 0; i < len - 1; ++i) { 234 int codepoint_a = 235 GetPhoneticallySortableCodePoint(codepoints[i], -1, 236 &next_is_consumed); 237 if (next_is_consumed) { 238 printf("next_is_consumed become true at 0x%04X", codepoint_a); 239 m_success = false; 240 return; 241 } 242 int codepoint_b = 243 GetPhoneticallySortableCodePoint(codepoints[i + 1], -1, 244 &next_is_consumed); 245 if (next_is_consumed) { 246 printf("next_is_consumed become true at 0x%04X", codepoint_b); 247 m_success = false; 248 return; 249 } 250 251 if (codepoint_a >= codepoint_b) { 252 printf("0x%04X (from 0x%04X) >= 0x%04X (from 0x%04X)\n", 253 codepoint_a, codepoints[i], codepoint_b, codepoints[i + 1]); 254 m_success = false; 255 return; 256 } 257 } 258} 259 260#define EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, i) \ 261 ({ \ 262 index = i; \ 263 if (!GetUtf8FromCodePoint(codepoint, dst, 10, &index)) { \ 264 printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \ 265 m_success = false; \ 266 } else if (index >= 10) { \ 267 printf("index (%d) >= 10\n", index); \ 268 m_success = false; \ 269 } else { \ 270 dst[index] = '\0'; \ 271 if (strcmp(dst + i, expected) != 0) { \ 272 printf("Failed at codepoint 0x%04X\n", codepoint); \ 273 for (const char *ch = dst; *ch != '\0'; ++ch) { \ 274 printf("0x%X ", *ch); \ 275 } \ 276 printf("!= "); \ 277 for (const char *ch = expected; *ch != '\0'; ++ch) { \ 278 printf("0x%X ", *ch); \ 279 } \ 280 printf("\n"); \ 281 m_success = false; \ 282 } \ 283 } \ 284 }) 285 286#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \ 287 EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, 0) 288 289 290void TestExecutor::testGetUtf8FromCodePoint() { 291 printf("testGetUtf8FromCodePoint()\n"); 292 size_t index = 0; 293 char dst[10]; 294 295 EXPECT_EQ_CODEPOINT_UTF8('a', "\x61"); 296 // Armenian capital letter AYB (2 bytes in UTF8) 297 EXPECT_EQ_CODEPOINT_UTF8(0x0530, "\xD4\xB0"); 298 // Japanese 'a' (3 bytes in UTF8) 299 EXPECT_EQ_CODEPOINT_UTF8(0x3042, "\xE3\x81\x82"); 300 // Kanji 301 EXPECT_EQ_CODEPOINT_UTF8(0x65E5, "\xE6\x97\xA5"); 302 // PUA (4 byets in UTF8) 303 EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96"); 304 EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2"); 305 306 EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(0x058F, "\xD6\x8F", 3); 307 308 index = 0; 309 if (GetUtf8FromCodePoint(0x3043, dst, 2, &index)) { 310 printf("GetUtf8FromCodePont() returned true even when destination length" 311 "is not enough\n"); 312 m_success = false; 313 } 314} 315 316#define EXPECT_EQ_UTF8_UTF8(src, expected) \ 317 ({ \ 318 if (!GetPhoneticallySortableString(src, &dst, &len)) { \ 319 printf("GetPhoneticallySortableString() returned false.\n"); \ 320 m_success = false; \ 321 } else { \ 322 if (strcmp(dst, expected) != 0) { \ 323 for (const char *ch = dst; *ch != '\0'; ++ch) { \ 324 printf("0x%X ", *ch); \ 325 } \ 326 printf("!= "); \ 327 for (const char *ch = expected; *ch != '\0'; ++ch) { \ 328 printf("0x%X ", *ch); \ 329 } \ 330 printf("\n"); \ 331 m_success = false; \ 332 } \ 333 free(dst); \ 334 } \ 335 }) 336 337void TestExecutor::testGetPhoneticallySortableString() { 338 char *dst; 339 size_t len; 340 341 // halfwidth alphabets -> fullwidth alphabets. 342 EXPECT_EQ_UTF8_UTF8("ABCD", 343 "\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3\xEF\xBC\xA4"); 344 // halfwidth/fullwidth-katakana -> hiragana 345 EXPECT_EQ_UTF8_UTF8( 346 "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA", 347 "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A"); 348} 349 350int main() { 351 TestExecutor executor; 352 if(executor.DoAllTests()) { 353 return 0; 354 } else { 355 return 1; 356 } 357} 358