1#include "pseudolocalize.h"
2
3using namespace std;
4
5// String basis to generate expansion
6static const String16 k_expansion_string = String16("one two three "
7    "four five six seven eight nine ten eleven twelve thirteen "
8    "fourteen fiveteen sixteen seventeen nineteen twenty");
9
10// Special unicode characters to override directionality of the words
11static const String16 k_rlm = String16("\xe2\x80\x8f");
12static const String16 k_rlo = String16("\xE2\x80\xae");
13static const String16 k_pdf = String16("\xE2\x80\xac");
14
15// Placeholder marks
16static const String16 k_placeholder_open = String16("\xc2\xbb");
17static const String16 k_placeholder_close = String16("\xc2\xab");
18
19static const char16_t k_arg_start = '{';
20static const char16_t k_arg_end = '}';
21
22Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m)
23    : mImpl(nullptr), mLastDepth(0) {
24  setMethod(m);
25}
26
27void Pseudolocalizer::setMethod(PseudolocalizationMethod m) {
28  if (mImpl) {
29    delete mImpl;
30  }
31  if (m == PSEUDO_ACCENTED) {
32    mImpl = new PseudoMethodAccent();
33  } else if (m == PSEUDO_BIDI) {
34    mImpl = new PseudoMethodBidi();
35  } else {
36    mImpl = new PseudoMethodNone();
37  }
38}
39
40String16 Pseudolocalizer::text(const String16& text) {
41  String16 out;
42  size_t depth = mLastDepth;
43  size_t lastpos, pos;
44  const size_t length= text.size();
45  const char16_t* str = text.string();
46  bool escaped = false;
47  for (lastpos = pos = 0; pos < length; pos++) {
48    char16_t c = str[pos];
49    if (escaped) {
50      escaped = false;
51      continue;
52    }
53    if (c == '\'') {
54      escaped = true;
55      continue;
56    }
57
58    if (c == k_arg_start) {
59      depth++;
60    } else if (c == k_arg_end && depth) {
61      depth--;
62    }
63
64    if (mLastDepth != depth || pos == length - 1) {
65      bool pseudo = ((mLastDepth % 2) == 0);
66      size_t nextpos = pos;
67      if (!pseudo || depth == mLastDepth) {
68        nextpos++;
69      }
70      size_t size = nextpos - lastpos;
71      if (size) {
72        String16 chunk = String16(text, size, lastpos);
73        if (pseudo) {
74          chunk = mImpl->text(chunk);
75        } else if (str[lastpos] == k_arg_start &&
76                   str[nextpos - 1] == k_arg_end) {
77          chunk = mImpl->placeholder(chunk);
78        }
79        out.append(chunk);
80      }
81      if (pseudo && depth < mLastDepth) { // End of message
82        out.append(mImpl->end());
83      } else if (!pseudo && depth > mLastDepth) { // Start of message
84        out.append(mImpl->start());
85      }
86      lastpos = nextpos;
87      mLastDepth = depth;
88    }
89  }
90  return out;
91}
92
/**
 * Looks up the replacement for a single character.
 *
 * @param c character to look up.
 * @return a NUL-terminated byte string holding the replacement, or NULL when
 *         the character has none. Letters 'a'-'z', letters 'A'-'Z' other than
 *         'F', and '!', '?', '$' have a replacement.
 */
static const char*
pseudolocalize_char(const char16_t c)
{
    // Replacements for 'a'..'z', indexed by (c - 'a').
    static const char* const k_lower[26] = {
        "\xc3\xa5",      // a
        "\xc9\x93",      // b
        "\xc3\xa7",      // c
        "\xc3\xb0",      // d
        "\xc3\xa9",      // e
        "\xc6\x92",      // f
        "\xc4\x9d",      // g
        "\xc4\xa5",      // h
        "\xc3\xae",      // i
        "\xc4\xb5",      // j
        "\xc4\xb7",      // k
        "\xc4\xbc",      // l
        "\xe1\xb8\xbf",  // m
        "\xc3\xb1",      // n
        "\xc3\xb6",      // o
        "\xc3\xbe",      // p
        "\x51",          // q
        "\xc5\x95",      // r
        "\xc5\xa1",      // s
        "\xc5\xa3",      // t
        "\xc3\xbb",      // u
        "\x56",          // v
        "\xc5\xb5",      // w
        "\xd1\x85",      // x
        "\xc3\xbd",      // y
        "\xc5\xbe",      // z
    };
    // Replacements for 'A'..'Z', indexed by (c - 'A').
    static const char* const k_upper[26] = {
        "\xc3\x85",      // A
        "\xce\xb2",      // B
        "\xc3\x87",      // C
        "\xc3\x90",      // D
        "\xc3\x89",      // E
        NULL,            // F has no replacement
        "\xc4\x9c",      // G
        "\xc4\xa4",      // H
        "\xc3\x8e",      // I
        "\xc4\xb4",      // J
        "\xc4\xb6",      // K
        "\xc4\xbb",      // L
        "\xe1\xb8\xbe",  // M
        "\xc3\x91",      // N
        "\xc3\x96",      // O
        "\xc3\x9e",      // P
        "\x71",          // Q
        "\xc5\x94",      // R
        "\xc5\xa0",      // S
        "\xc5\xa2",      // T
        "\xc3\x9b",      // U
        "\xce\xbd",      // V
        "\xc5\xb4",      // W
        "\xc3\x97",      // X
        "\xc3\x9d",      // Y
        "\xc5\xbd",      // Z
    };

    if (c >= 'a' && c <= 'z') {
        return k_lower[c - 'a'];
    }
    if (c >= 'A' && c <= 'Z') {
        return k_upper[c - 'A'];
    }
    if (c == '!') {
        return "\xc2\xa1";
    }
    if (c == '?') {
        return "\xc2\xbf";
    }
    if (c == '$') {
        return "\xe2\x82\xac";
    }
    return NULL;
}
154
/**
 * Tells whether `c` can be the final character of a '%' placeholder.
 *
 * 't' is deliberately absent: the caller handles it separately because it is
 * followed by one more character.
 * NOTE(review): the accepted set looks like the java.util.Formatter
 * conversion characters - confirm against the callers' format syntax.
 */
static bool is_possible_normal_placeholder_end(const char16_t c) {
    switch (c) {
        case 's': case 'S':
        case 'c': case 'C':
        case 'd': case 'o':
        case 'x': case 'X':
        case 'f':
        case 'e': case 'E':
        case 'g': case 'G':
        case 'a': case 'A':
        case 'b': case 'B':
        case 'h': case 'H':
        case '%':
        case 'n':
            return true;
        default:
            return false;
    }
}
181
182static String16 pseudo_generate_expansion(const unsigned int length) {
183    String16 result = k_expansion_string;
184    const char16_t* s = result.string();
185    if (result.size() < length) {
186        result += String16(" ");
187        result += pseudo_generate_expansion(length - result.size());
188    } else {
189        int ext = 0;
190        // Should contain only whole words, so looking for a space
191        for (unsigned int i = length + 1; i < result.size(); ++i) {
192          ++ext;
193          if (s[i] == ' ') {
194            break;
195          }
196        }
197        result.remove(length + ext, 0);
198    }
199    return result;
200}
201
// Returns true for the three characters treated as word separators:
// space, tab and newline.
static bool is_space(const char16_t c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
      return true;
    default:
      return false;
  }
}
205
206String16 PseudoMethodAccent::start() {
207  String16 result;
208  if (mDepth == 0) {
209    result = String16(String8("["));
210  }
211  mWordCount = mLength = 0;
212  mDepth++;
213  return result;
214}
215
216String16 PseudoMethodAccent::end() {
217  String16 result;
218  if (mLength) {
219    result.append(String16(String8(" ")));
220    result.append(pseudo_generate_expansion(
221        mWordCount > 3 ? mLength : mLength / 2));
222  }
223  mWordCount = mLength = 0;
224  mDepth--;
225  if (mDepth == 0) {
226    result.append(String16(String8("]")));
227  }
228  return result;
229}
230
231/**
232 * Converts characters so they look like they've been localized.
233 *
234 * Note: This leaves escape sequences untouched so they can later be
235 * processed by ResTable::collectString in the normal way.
236 */
237String16 PseudoMethodAccent::text(const String16& source)
238{
239    const char16_t* s = source.string();
240    String16 result;
241    const size_t I = source.size();
242    bool lastspace = true;
243    for (size_t i=0; i<I; i++) {
244        char16_t c = s[i];
245        if (c == '\\') {
246            // Escape syntax, no need to pseudolocalize
247            if (i<I-1) {
248                result += String16("\\");
249                i++;
250                c = s[i];
251                switch (c) {
252                    case 'u':
253                        // this one takes up 5 chars
254                        result += String16(s+i, 5);
255                        i += 4;
256                        break;
257                    case 't':
258                    case 'n':
259                    case '#':
260                    case '@':
261                    case '?':
262                    case '"':
263                    case '\'':
264                    case '\\':
265                    default:
266                        result.append(&c, 1);
267                        break;
268                }
269            } else {
270                result.append(&c, 1);
271            }
272        } else if (c == '%') {
273            // Placeholder syntax, no need to pseudolocalize
274            String16 chunk;
275            bool end = false;
276            chunk.append(&c, 1);
277            while (!end && i < I) {
278                ++i;
279                c = s[i];
280                chunk.append(&c, 1);
281                if (is_possible_normal_placeholder_end(c)) {
282                    end = true;
283                } else if (c == 't') {
284                    ++i;
285                    c = s[i];
286                    chunk.append(&c, 1);
287                    end = true;
288                }
289            }
290            // Treat chunk as a placeholder unless it ends with %.
291            result += ((c == '%') ? chunk : placeholder(chunk));
292        } else if (c == '<' || c == '&') {
293            // html syntax, no need to pseudolocalize
294            bool tag_closed = false;
295            while (!tag_closed && i < I) {
296                if (c == '&') {
297                    String16 escape_text;
298                    escape_text.append(&c, 1);
299                    bool end = false;
300                    size_t htmlCodePos = i;
301                    while (!end && htmlCodePos < I) {
302                        ++htmlCodePos;
303                        c = s[htmlCodePos];
304                        escape_text.append(&c, 1);
305                        // Valid html code
306                        if (c == ';') {
307                            end = true;
308                            i = htmlCodePos;
309                        }
310                        // Wrong html code
311                        else if (!((c == '#' ||
312                                 (c >= 'a' && c <= 'z') ||
313                                 (c >= 'A' && c <= 'Z') ||
314                                 (c >= '0' && c <= '9')))) {
315                            end = true;
316                        }
317                    }
318                    result += escape_text;
319                    if (escape_text != String16("&lt;")) {
320                        tag_closed = true;
321                    }
322                    continue;
323                }
324                if (c == '>') {
325                    tag_closed = true;
326                    result.append(&c, 1);
327                    continue;
328                }
329                result.append(&c, 1);
330                i++;
331                c = s[i];
332            }
333        } else {
334            // This is a pure text that should be pseudolocalized
335            const char* p = pseudolocalize_char(c);
336            if (p != NULL) {
337                result += String16(p);
338            } else {
339                bool space = is_space(c);
340                if (lastspace && !space) {
341                  mWordCount++;
342                }
343                lastspace = space;
344                result.append(&c, 1);
345            }
346            // Count only pseudolocalizable chars and delimiters
347            mLength++;
348        }
349    }
350    return result;
351}
352String16 PseudoMethodAccent::placeholder(const String16& source) {
353  // Surround a placeholder with brackets
354  return k_placeholder_open + source + k_placeholder_close;
355}
356
357String16 PseudoMethodBidi::text(const String16& source)
358{
359    const char16_t* s = source.string();
360    String16 result;
361    bool lastspace = true;
362    bool space = true;
363    for (size_t i=0; i<source.size(); i++) {
364        char16_t c = s[i];
365        space = is_space(c);
366        if (lastspace && !space) {
367          // Word start
368          result += k_rlm + k_rlo;
369        } else if (!lastspace && space) {
370          // Word end
371          result += k_pdf + k_rlm;
372        }
373        lastspace = space;
374        result.append(&c, 1);
375    }
376    if (!lastspace) {
377      // End of last word
378      result += k_pdf + k_rlm;
379    }
380    return result;
381}
382
383String16 PseudoMethodBidi::placeholder(const String16& source) {
384  // Surround a placeholder with directionality change sequence
385  return k_rlm + k_rlo + source + k_pdf + k_rlm;
386}
387
388