1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <assert.h>
18#include "../include/splparser.h"
19
20namespace ime_pinyin {
21
22SpellingParser::SpellingParser() {
23  spl_trie_ = SpellingTrie::get_cpinstance();
24}
25
26bool SpellingParser::is_valid_to_parse(char ch) {
27  return SpellingTrie::is_valid_spl_char(ch);
28}
29
30uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
31                                      uint16 spl_idx[], uint16 start_pos[],
32                                      uint16 max_size, bool &last_is_pre) {
33  if (NULL == splstr || 0 == max_size || 0 == str_len)
34    return 0;
35
36  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
37    return 0;
38
39  last_is_pre = false;
40
41  const SpellingNode *node_this = spl_trie_->root_;
42
43  uint16 str_pos = 0;
44  uint16 idx_num = 0;
45  if (NULL != start_pos)
46    start_pos[0] = 0;
47  bool last_is_splitter = false;
48
49  while (str_pos < str_len) {
50    char char_this = splstr[str_pos];
51    // all characters outside of [a, z] are considered as splitters
52    if (!SpellingTrie::is_valid_spl_char(char_this)) {
53      // test if the current node is endable
54      uint16 id_this = node_this->spelling_idx;
55      if (spl_trie_->if_valid_id_update(&id_this)) {
56        spl_idx[idx_num] = id_this;
57
58        idx_num++;
59        str_pos++;
60        if (NULL != start_pos)
61          start_pos[idx_num] = str_pos;
62        if (idx_num >= max_size)
63          return idx_num;
64
65        node_this = spl_trie_->root_;
66        last_is_splitter = true;
67        continue;
68      } else {
69        if (last_is_splitter) {
70          str_pos++;
71          if (NULL != start_pos)
72            start_pos[idx_num] = str_pos;
73          continue;
74        } else {
75          return idx_num;
76        }
77      }
78    }
79
80    last_is_splitter = false;
81
82    SpellingNode *found_son = NULL;
83
84    if (0 == str_pos) {
85      if (char_this >= 'a')
86        found_son = spl_trie_->level1_sons_[char_this - 'a'];
87      else
88        found_son = spl_trie_->level1_sons_[char_this - 'A'];
89    } else {
90      SpellingNode *first_son = node_this->first_son;
91      // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
92      // frequently used, so we scan from the end.
93      for (int i = 0; i < node_this->num_of_son; i++) {
94        SpellingNode *this_son = first_son + i;
95        if (SpellingTrie::is_same_spl_char(
96            this_son->char_this_node, char_this)) {
97          found_son = this_son;
98          break;
99        }
100      }
101    }
102
103    // found, just move the current node pointer to the the son
104    if (NULL != found_son) {
105      node_this = found_son;
106    } else {
107      // not found, test if it is endable
108      uint16 id_this = node_this->spelling_idx;
109      if (spl_trie_->if_valid_id_update(&id_this)) {
110        // endable, remember the index
111        spl_idx[idx_num] = id_this;
112
113        idx_num++;
114        if (NULL != start_pos)
115          start_pos[idx_num] = str_pos;
116        if (idx_num >= max_size)
117          return idx_num;
118        node_this = spl_trie_->root_;
119        continue;
120      } else {
121        return idx_num;
122      }
123    }
124
125    str_pos++;
126  }
127
128  uint16 id_this = node_this->spelling_idx;
129  if (spl_trie_->if_valid_id_update(&id_this)) {
130    // endable, remember the index
131    spl_idx[idx_num] = id_this;
132
133    idx_num++;
134    if (NULL != start_pos)
135      start_pos[idx_num] = str_pos;
136  }
137
138  last_is_pre = !last_is_splitter;
139
140  return idx_num;
141}
142
143uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
144                                        uint16 spl_idx[], uint16 start_pos[],
145                                        uint16 max_size, bool &last_is_pre) {
146  uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
147                                  max_size, last_is_pre);
148  for (uint16 pos = 0; pos < idx_num; pos++) {
149    if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
150      spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
151      if (pos == idx_num - 1) {
152        last_is_pre = false;
153      }
154    }
155  }
156  return idx_num;
157}
158
159uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
160                                        uint16 spl_idx[], uint16 start_pos[],
161                                        uint16 max_size, bool &last_is_pre) {
162  if (NULL == splstr || 0 == max_size || 0 == str_len)
163    return 0;
164
165  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
166    return 0;
167
168  last_is_pre = false;
169
170  const SpellingNode *node_this = spl_trie_->root_;
171
172  uint16 str_pos = 0;
173  uint16 idx_num = 0;
174  if (NULL != start_pos)
175    start_pos[0] = 0;
176  bool last_is_splitter = false;
177
178  while (str_pos < str_len) {
179    char16 char_this = splstr[str_pos];
180    // all characters outside of [a, z] are considered as splitters
181    if (!SpellingTrie::is_valid_spl_char(char_this)) {
182      // test if the current node is endable
183      uint16 id_this = node_this->spelling_idx;
184      if (spl_trie_->if_valid_id_update(&id_this)) {
185        spl_idx[idx_num] = id_this;
186
187        idx_num++;
188        str_pos++;
189        if (NULL != start_pos)
190          start_pos[idx_num] = str_pos;
191        if (idx_num >= max_size)
192          return idx_num;
193
194        node_this = spl_trie_->root_;
195        last_is_splitter = true;
196        continue;
197      } else {
198        if (last_is_splitter) {
199          str_pos++;
200          if (NULL != start_pos)
201            start_pos[idx_num] = str_pos;
202          continue;
203        } else {
204          return idx_num;
205        }
206      }
207    }
208
209    last_is_splitter = false;
210
211    SpellingNode *found_son = NULL;
212
213    if (0 == str_pos) {
214      if (char_this >= 'a')
215        found_son = spl_trie_->level1_sons_[char_this - 'a'];
216      else
217        found_son = spl_trie_->level1_sons_[char_this - 'A'];
218    } else {
219      SpellingNode *first_son = node_this->first_son;
220      // Because for Zh/Ch/Sh nodes, they are the last in the buffer and
221      // frequently used, so we scan from the end.
222      for (int i = 0; i < node_this->num_of_son; i++) {
223        SpellingNode *this_son = first_son + i;
224        if (SpellingTrie::is_same_spl_char(
225            this_son->char_this_node, char_this)) {
226          found_son = this_son;
227          break;
228        }
229      }
230    }
231
232    // found, just move the current node pointer to the the son
233    if (NULL != found_son) {
234      node_this = found_son;
235    } else {
236      // not found, test if it is endable
237      uint16 id_this = node_this->spelling_idx;
238      if (spl_trie_->if_valid_id_update(&id_this)) {
239        // endable, remember the index
240        spl_idx[idx_num] = id_this;
241
242        idx_num++;
243        if (NULL != start_pos)
244          start_pos[idx_num] = str_pos;
245        if (idx_num >= max_size)
246          return idx_num;
247        node_this = spl_trie_->root_;
248        continue;
249      } else {
250        return idx_num;
251      }
252    }
253
254    str_pos++;
255  }
256
257  uint16 id_this = node_this->spelling_idx;
258  if (spl_trie_->if_valid_id_update(&id_this)) {
259    // endable, remember the index
260    spl_idx[idx_num] = id_this;
261
262    idx_num++;
263    if (NULL != start_pos)
264      start_pos[idx_num] = str_pos;
265  }
266
267  last_is_pre = !last_is_splitter;
268
269  return idx_num;
270}
271
272uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
273                                          uint16 spl_idx[], uint16 start_pos[],
274                                          uint16 max_size, bool &last_is_pre) {
275  uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
276                                    max_size, last_is_pre);
277  for (uint16 pos = 0; pos < idx_num; pos++) {
278    if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) {
279      spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos);
280      if (pos == idx_num - 1) {
281        last_is_pre = false;
282      }
283    }
284  }
285  return idx_num;
286}
287
288uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
289                                        bool *is_pre) {
290  if (NULL == is_pre)
291    return 0;
292
293  uint16 spl_idx[2];
294  uint16 start_pos[3];
295
296  if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
297    return 0;
298
299  if (start_pos[1] != str_len)
300    return 0;
301  return spl_idx[0];
302}
303
304uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
305                                          bool *is_pre) {
306  if (NULL == is_pre)
307    return 0;
308
309  uint16 spl_idx[2];
310  uint16 start_pos[3];
311
312  if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1)
313    return 0;
314
315  if (start_pos[1] != str_len)
316    return 0;
317  if (spl_trie_->is_half_id_yunmu(spl_idx[0])) {
318    spl_trie_->half_to_full(spl_idx[0], spl_idx);
319    *is_pre = false;
320  }
321
322  return spl_idx[0];
323}
324
325uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
326    uint16 splidx[], uint16 max_size,
327    uint16 &full_id_num, bool &is_pre) {
328  if (max_size <= 0 || !is_valid_to_parse(splstr[0]))
329    return 0;
330
331  splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
332  full_id_num = 0;
333  if (0 != splidx[0]) {
334    if (splidx[0] >= kFullSplIdStart)
335      full_id_num = 1;
336    return 1;
337  }
338  return 0;
339}
340
341}  // namespace ime_pinyin
342