// PhoneticStringUtilsTest.cpp revision 455ed29fb92a9adf411252df5e74541269d10806
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "PhoneticStringUtils.h"
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22
23using namespace android;
24
// Minimal self-contained test harness for the PhoneticStringUtils functions.
//
// Every test case is a private member function. A test signals failure by
// setting m_success to false; DoOneTest() resets that flag before each test
// and updates the counters afterwards.
class TestExecutor {
 public:
  TestExecutor() : m_total_count(0), m_success_count(0), m_success(true) {}

  // Runs all test cases, prints a summary and returns true only when every
  // test case succeeded.
  bool DoAllTests();

 private:
  // Runs one test case and records its outcome in the counters below.
  void DoOneTest(void (TestExecutor::*test)());

  void testGetCodePointFromUtf8();
  void testGetPhoneticallySortableCodePointAscii();
  void testGetPhoneticallySortableCodePointKana();
  void testGetPhoneticallySortableCodePointSimpleCompare();
  void testGetUtf8FromCodePoint();
  void testGetPhoneticallySortableString();

  // Note: When adding a test, do not forget to register it in DoAllTests();
  // a test that is only declared here is never executed.

  int m_total_count;    // Number of test cases executed so far.
  int m_success_count;  // Number of executed test cases that succeeded.

  // Outcome of the test case currently running.
  bool m_success;
};
46
// Compares two int values and aborts the current test on mismatch.
//
// Must be used inside a function returning void that has an assignable
// m_success in scope (the TestExecutor test methods). On mismatch it prints
// both values in hex, sets m_success to false and returns from the enclosing
// function.
//
// Each argument is evaluated exactly once, so the reported value is the very
// value that was compared even when the argument is a call with side effects
// (e.g. one that updates an out-parameter).
#define ASSERT_EQ_VALUE(input, expected)                                \
  do {                                                                  \
    const int assert_eq_actual_ = (input);                              \
    const int assert_eq_wanted_ = (expected);                           \
    if (assert_eq_wanted_ != assert_eq_actual_) {                       \
      printf("0x%X(result) != 0x%X(expected)\n",                        \
             static_cast<unsigned int>(assert_eq_actual_),              \
             static_cast<unsigned int>(assert_eq_wanted_));             \
      m_success = false;                                                \
      return;                                                           \
    }                                                                   \
  } while (0)
55
// Compares two int values and records a failure on mismatch without leaving
// the current test.
//
// Requires an assignable m_success in scope (the TestExecutor test methods).
// On mismatch it prints both values in hex and sets m_success to false;
// execution then continues with the next statement.
//
// Each argument is evaluated exactly once, so the reported value is the very
// value that was compared even when the argument has side effects.
#define EXPECT_EQ_VALUE(input, expected)                                \
  do {                                                                  \
    const int expect_eq_actual_ = (input);                              \
    const int expect_eq_wanted_ = (expected);                           \
    if (expect_eq_wanted_ != expect_eq_actual_) {                       \
      printf("0x%X(result) != 0x%X(expected)\n",                        \
             static_cast<unsigned int>(expect_eq_actual_),              \
             static_cast<unsigned int>(expect_eq_wanted_));             \
      m_success = false;                                                \
    }                                                                   \
  } while (0)
63
64
65bool TestExecutor::DoAllTests() {
66  DoOneTest(&TestExecutor::testGetCodePointFromUtf8);
67  DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii);
68  DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana);
69  DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
70  DoOneTest(&TestExecutor::testGetUtf8FromCodePoint);
71  DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
72
73  printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
74         m_total_count, m_success_count, m_total_count - m_success_count);
75
76  bool success = m_total_count == m_success_count;
77  printf("\n%s\n", success ? "Success" : "Failure");
78
79  return success;
80}
81
82void TestExecutor::DoOneTest(void (TestExecutor::*test)()) {
83  m_success = true;
84
85  (this->*test)();
86
87  ++m_total_count;
88  m_success_count += m_success ? 1 : 0;
89}
90
91void TestExecutor::testGetCodePointFromUtf8() {
92  printf("testGetCodePointFromUtf8()\n");
93  int next;
94
95  EXPECT_EQ_VALUE(GetCodePointFromUtf8("a", 1, 0, &next), 97);
96  EXPECT_EQ_VALUE(next, 1);
97  // Japanese hiragana "a"
98  EXPECT_EQ_VALUE(GetCodePointFromUtf8("\xE3\x81\x82", 3, 0, &next), 0x3042);
99  EXPECT_EQ_VALUE(next, 3);
100  // Japanese fullwidth katakana "a" with ascii a
101  EXPECT_EQ_VALUE(GetCodePointFromUtf8("a\xE3\x82\xA2", 4, 1, &next), 0x30A2);
102  EXPECT_EQ_VALUE(next, 4);
103
104  // 2 PUA
105  ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
106                                       8, 0, &next), 0xFE000);
107  ASSERT_EQ_VALUE(next, 4);
108  ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88",
109                                       8, next, &next), 0xFE008);
110  ASSERT_EQ_VALUE(next, 8);
111}
112
113void TestExecutor::testGetPhoneticallySortableCodePointAscii() {
114  printf("testGetPhoneticallySortableCodePoint()\n");
115  int halfwidth[94];
116  int fullwidth[94];
117  int i, codepoint;
118  bool next_is_consumed;
119  for (i = 0, codepoint = 0x0021; codepoint <= 0x007E; ++i, ++codepoint) {
120    halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
121                                                    &next_is_consumed);
122    if (halfwidth[i] < 0) {
123      printf("returned value become negative at 0x%04X", codepoint);
124    }
125    if (next_is_consumed) {
126      printf("next_is_consumed become true at 0x%04X", codepoint);
127      m_success = false;
128      return;
129    }
130  }
131  for (i = 0, codepoint = 0xFF01; codepoint <= 0xFF5E; ++i, ++codepoint) {
132    fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
133                                                    &next_is_consumed);
134    if (fullwidth[i] < 0) {
135      printf("returned value become negative at 0x%04X", codepoint);
136    }
137    if (next_is_consumed) {
138      printf("next_is_consumed become true at 0x%04X", codepoint);
139      m_success = false;
140      return;
141    }
142  }
143
144  for (i = 0; i < 94; i++) {
145    EXPECT_EQ_VALUE(halfwidth[i], fullwidth[i]);
146  }
147}
148
149void TestExecutor::testGetPhoneticallySortableCodePointKana() {
150  printf("testGetPhoneticallySortableCodePointKana()\n");
151  int hiragana[86];
152  int fullwidth_katakana[86];
153  int i, codepoint;
154  bool next_is_consumed;
155
156  for (i = 0, codepoint = 0x3041; codepoint <= 0x3096; ++i, ++codepoint) {
157    hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
158                                                   &next_is_consumed);
159    if (hiragana[i] < 0) {
160      printf("returned value become negative at 0x%04X", codepoint);
161    }
162    if (next_is_consumed) {
163      printf("next_is_consumed become true at 0x%04X", codepoint);
164      m_success = false;
165      return;
166    }
167  }
168
169  for (i = 0, codepoint = 0x30A1; codepoint <= 0x30F6; ++i, ++codepoint) {
170    fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, -1,
171                                                   &next_is_consumed);
172    if (fullwidth_katakana[i] < 0) {
173      printf("returned value become negative at 0x%04X", codepoint);
174    }
175    if (next_is_consumed) {
176      printf("next_is_consumed become true at 0x%04X", codepoint);
177      m_success = false;
178      return;
179    }
180  }
181
182  // hankaku-katakana space do not have some characters corresponding to
183  // zenkaku-hiragana (e.g. xwa, xka, xku). To make test easier, insert
184  // zenkaku-katakana version of them into this array (See the value 0x30??).
185  int halfwidth_katakana[] = {
186    0xFF67, 0xFF71, 0xFF68, 0xFF72, 0xFF69, 0xFF73, 0xFF6A, 0xFF74, 0xFF6B,
187    0xFF75, 0xFF76, 0xFF76, 0xFF9E, 0xFF77, 0xFF77, 0xFF9E, 0xFF78, 0xFF78,
188    0xFF9E, 0xFF79, 0xFF79, 0xFF9E, 0xFF7A, 0xFF7A, 0xFF9E, 0xFF7B, 0xFF7B,
189    0xFF9E, 0xFF7C, 0xFF7C, 0xFF9E, 0xFF7D, 0xFF7D, 0xFF9E, 0xFF7E, 0xFF7E,
190    0xFF9E, 0xFF7F, 0xFF7F, 0xFF9E, 0xFF80, 0xFF80, 0xFF9E, 0xFF81, 0xFF81,
191    0xFF9E, 0xFF6F, 0xFF82, 0xFF82, 0xFF9E, 0xFF83, 0xFF83, 0xFF9E, 0xFF84,
192    0xFF84, 0xFF9E, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8A,
193    0xFF9E, 0xFF8A, 0xFF9F, 0xFF8B, 0xFF8B, 0xFF9E, 0xFF8B, 0xFF9F, 0xFF8C,
194    0xFF8C, 0xFF9E, 0xFF8C, 0xFF9F, 0xFF8D, 0xFF8D, 0xFF9E, 0xFF8D, 0xFF9F,
195    0xFF8E, 0xFF8E, 0xFF9E, 0xFF8E, 0xFF9F, 0xFF8F, 0xFF90, 0xFF91, 0xFF92,
196    0xFF93, 0xFF6C, 0xFF94, 0xFF6D, 0xFF95, 0xFF6E, 0xFF96, 0xFF97, 0xFF98,
197    0xFF99, 0xFF9A, 0xFF9B, 0x30EE, 0xFF9C, 0x30F0, 0x30F1, 0xFF66, 0xFF9D,
198    0xFF73, 0xFF9E, 0x30F5, 0x30F6};
199  int len = sizeof(halfwidth_katakana)/sizeof(int);
200
201  int halfwidth_katakana_result[86];
202
203  int j;
204  for (i = 0, j = 0; i < len && j < 86; ++i, ++j) {
205    int codepoint = halfwidth_katakana[i];
206    int next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : -1;
207    halfwidth_katakana_result[j] =
208        GetPhoneticallySortableCodePoint(codepoint, next_codepoint,
209                                         &next_is_consumed);
210    // Consume voiced mark/half-voiced mark.
211    if (next_is_consumed) {
212      ++i;
213    }
214  }
215  ASSERT_EQ_VALUE(i, len);
216  ASSERT_EQ_VALUE(j, 86);
217
218  for (i = 0; i < 86; ++i) {
219    EXPECT_EQ_VALUE(fullwidth_katakana[i], hiragana[i]);
220    EXPECT_EQ_VALUE(halfwidth_katakana_result[i], hiragana[i]);
221  }
222}
223
224void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() {
225  printf("testGetPhoneticallySortableCodePointSimpleCompare()\n");
226
227  int codepoints[] = {
228    0x3042, 0x30AB, 0xFF7B, 0x305F, 0x30CA, 0xFF8A, 0x30D0, 0x3071,
229    0x307E, 0x30E4, 0xFF97, 0x308F, 0x3093, 0x3094, 'A', 'Z',
230    '0', '9', '!', '/', ':', '?', '[', '`', '{', '~'};
231  size_t len = sizeof(codepoints)/sizeof(int);
232  bool next_is_consumed;
233  for (size_t i = 0; i < len - 1; ++i) {
234    int codepoint_a =
235        GetPhoneticallySortableCodePoint(codepoints[i], -1,
236                                         &next_is_consumed);
237    if (next_is_consumed) {
238      printf("next_is_consumed become true at 0x%04X", codepoint_a);
239      m_success = false;
240      return;
241    }
242    int codepoint_b =
243        GetPhoneticallySortableCodePoint(codepoints[i + 1], -1,
244                                         &next_is_consumed);
245    if (next_is_consumed) {
246      printf("next_is_consumed become true at 0x%04X", codepoint_b);
247      m_success = false;
248      return;
249    }
250
251    if (codepoint_a >= codepoint_b) {
252      printf("0x%04X (from 0x%04X) >= 0x%04X (from 0x%04X)\n",
253             codepoint_a, codepoints[i], codepoint_b, codepoints[i + 1]);
254      m_success = false;
255      return;
256    }
257  }
258}
259
// Encodes |codepoint| with GetUtf8FromCodePoint() into the caller's 10-byte
// buffer `dst`, starting at offset |i|, and checks that the bytes written
// equal the NUL-terminated string |expected|.
//
// Requires `char dst[10]`, `size_t index` and an assignable m_success in
// scope. On any failure a diagnostic is printed and m_success is set to
// false; execution always continues with the next statement.
//
// Each argument is evaluated exactly once. The byte dump printed on mismatch
// starts at the offset that was compared and shows each byte as an unsigned
// value, so bytes >= 0x80 are not sign-extended.
#define EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, i)     \
  do {                                                                  \
    const int utf8_check_cp_ = (codepoint);                             \
    const char *const utf8_check_want_ = (expected);                    \
    const size_t utf8_check_start_ = (i);                               \
    index = utf8_check_start_;                                          \
    if (!GetUtf8FromCodePoint(utf8_check_cp_, dst, 10, &index)) {       \
      printf("GetUtf8FromCodePoint() returned false at 0x%04X\n",       \
             static_cast<unsigned int>(utf8_check_cp_));                \
      m_success = false;                                                \
    } else if (index >= 10) {                                           \
      /* No room left for the terminating NUL written below. */         \
      printf("index (%d) >= 10\n", static_cast<int>(index));            \
      m_success = false;                                                \
    } else {                                                            \
      dst[index] = '\0';                                                \
      const char *const utf8_check_got_ = dst + utf8_check_start_;      \
      if (strcmp(utf8_check_got_, utf8_check_want_) != 0) {             \
        printf("Failed at codepoint 0x%04X\n",                          \
               static_cast<unsigned int>(utf8_check_cp_));              \
        for (const char *ch = utf8_check_got_; *ch != '\0'; ++ch) {     \
          printf("0x%X ", static_cast<unsigned int>(                    \
                              static_cast<unsigned char>(*ch)));        \
        }                                                               \
        printf("!= ");                                                  \
        for (const char *ch = utf8_check_want_; *ch != '\0'; ++ch) {    \
          printf("0x%X ", static_cast<unsigned int>(                    \
                              static_cast<unsigned char>(*ch)));        \
        }                                                               \
        printf("\n");                                                   \
        m_success = false;                                              \
      }                                                                 \
    }                                                                   \
  } while (0)

// Same check as above with the encoded bytes written at the start of `dst`.
#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)          \
  EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, 0)
288
289
290void TestExecutor::testGetUtf8FromCodePoint() {
291  printf("testGetUtf8FromCodePoint()\n");
292  size_t index = 0;
293  char dst[10];
294
295  EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
296  // Armenian capital letter AYB (2 bytes in UTF8)
297  EXPECT_EQ_CODEPOINT_UTF8(0x0530, "\xD4\xB0");
298  // Japanese 'a' (3 bytes in UTF8)
299  EXPECT_EQ_CODEPOINT_UTF8(0x3042, "\xE3\x81\x82");
300  // Kanji
301  EXPECT_EQ_CODEPOINT_UTF8(0x65E5, "\xE6\x97\xA5");
302  // PUA (4 byets in UTF8)
303  EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
304  EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
305
306  EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(0x058F, "\xD6\x8F", 3);
307
308  index = 0;
309  if (GetUtf8FromCodePoint(0x3043, dst, 2, &index)) {
310    printf("GetUtf8FromCodePont() returned true even when destination length"
311           "is not enough\n");
312    m_success = false;
313  }
314}
315
// Converts |src| with GetPhoneticallySortableString() and checks that the
// result equals the NUL-terminated string |expected|.
//
// Requires `char *dst`, `size_t len` and an assignable m_success in scope.
// On failure a diagnostic is printed and m_success is set to false;
// execution always continues with the next statement. The buffer returned
// through `dst` is released with free() and `dst` is reset to NULL so that
// no dangling pointer is left behind for the next use.
//
// Each argument is evaluated exactly once, and the byte dump printed on
// mismatch shows each byte as an unsigned value, so bytes >= 0x80 are not
// sign-extended.
#define EXPECT_EQ_UTF8_UTF8(src, expected)                              \
  do {                                                                  \
    const char *const sortable_want_ = (expected);                      \
    if (!GetPhoneticallySortableString((src), &dst, &len)) {            \
      printf("GetPhoneticallySortableString() returned false.\n");      \
      m_success = false;                                                \
    } else {                                                            \
      if (strcmp(dst, sortable_want_) != 0) {                           \
        for (const char *ch = dst; *ch != '\0'; ++ch) {                 \
          printf("0x%X ", static_cast<unsigned int>(                    \
                              static_cast<unsigned char>(*ch)));        \
        }                                                               \
        printf("!= ");                                                  \
        for (const char *ch = sortable_want_; *ch != '\0'; ++ch) {      \
          printf("0x%X ", static_cast<unsigned int>(                    \
                              static_cast<unsigned char>(*ch)));        \
        }                                                               \
        printf("\n");                                                   \
        m_success = false;                                              \
      }                                                                 \
      free(dst);                                                        \
      dst = NULL;                                                       \
    }                                                                   \
  } while (0)
336
337void TestExecutor::testGetPhoneticallySortableString() {
338  char *dst;
339  size_t len;
340
341  // halfwidth alphabets -> fullwidth alphabets.
342  EXPECT_EQ_UTF8_UTF8("ABCD",
343                      "\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3\xEF\xBC\xA4");
344  // halfwidth/fullwidth-katakana -> hiragana
345  EXPECT_EQ_UTF8_UTF8(
346      "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
347      "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
348}
349
350int main() {
351  TestExecutor executor;
352  if(executor.DoAllTests()) {
353    return 0;
354  } else {
355    return 1;
356  }
357}
358