1#include "pseudolocalize.h" 2 3using namespace std; 4 5// String basis to generate expansion 6static const String16 k_expansion_string = String16("one two three " 7 "four five six seven eight nine ten eleven twelve thirteen " 8 "fourteen fiveteen sixteen seventeen nineteen twenty"); 9 10// Special unicode characters to override directionality of the words 11static const String16 k_rlm = String16("\xe2\x80\x8f"); 12static const String16 k_rlo = String16("\xE2\x80\xae"); 13static const String16 k_pdf = String16("\xE2\x80\xac"); 14 15// Placeholder marks 16static const String16 k_placeholder_open = String16("\xc2\xbb"); 17static const String16 k_placeholder_close = String16("\xc2\xab"); 18 19static const char16_t k_arg_start = '{'; 20static const char16_t k_arg_end = '}'; 21 22Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m) 23 : mImpl(nullptr), mLastDepth(0) { 24 setMethod(m); 25} 26 27void Pseudolocalizer::setMethod(PseudolocalizationMethod m) { 28 if (mImpl) { 29 delete mImpl; 30 } 31 if (m == PSEUDO_ACCENTED) { 32 mImpl = new PseudoMethodAccent(); 33 } else if (m == PSEUDO_BIDI) { 34 mImpl = new PseudoMethodBidi(); 35 } else { 36 mImpl = new PseudoMethodNone(); 37 } 38} 39 40String16 Pseudolocalizer::text(const String16& text) { 41 String16 out; 42 size_t depth = mLastDepth; 43 size_t lastpos, pos; 44 const size_t length= text.size(); 45 const char16_t* str = text.string(); 46 bool escaped = false; 47 for (lastpos = pos = 0; pos < length; pos++) { 48 char16_t c = str[pos]; 49 if (escaped) { 50 escaped = false; 51 continue; 52 } 53 if (c == '\'') { 54 escaped = true; 55 continue; 56 } 57 58 if (c == k_arg_start) { 59 depth++; 60 } else if (c == k_arg_end && depth) { 61 depth--; 62 } 63 64 if (mLastDepth != depth || pos == length - 1) { 65 bool pseudo = ((mLastDepth % 2) == 0); 66 size_t nextpos = pos; 67 if (!pseudo || depth == mLastDepth) { 68 nextpos++; 69 } 70 size_t size = nextpos - lastpos; 71 if (size) { 72 String16 chunk = String16(text, size, lastpos); 73 if (pseudo) { 74 chunk = mImpl->text(chunk); 75 } else if (str[lastpos] == k_arg_start && 76 str[nextpos - 1] == k_arg_end) { 77 chunk = mImpl->placeholder(chunk); 78 } 79 out.append(chunk); 80 } 81 if (pseudo && depth < mLastDepth) { // End of message 82 out.append(mImpl->end()); 83 } else if (!pseudo && depth > mLastDepth) { // Start of message 84 out.append(mImpl->start()); 85 } 86 lastpos = nextpos; 87 mLastDepth = depth; 88 } 89 } 90 return out; 91} 92 93static const char* 94pseudolocalize_char(const char16_t c) 95{ 96 switch (c) { 97 case 'a': return "\xc3\xa5"; 98 case 'b': return "\xc9\x93"; 99 case 'c': return "\xc3\xa7"; 100 case 'd': return "\xc3\xb0"; 101 case 'e': return "\xc3\xa9"; 102 case 'f': return "\xc6\x92"; 103 case 'g': return "\xc4\x9d"; 104 case 'h': return "\xc4\xa5"; 105 case 'i': return "\xc3\xae"; 106 case 'j': return "\xc4\xb5"; 107 case 'k': return "\xc4\xb7"; 108 case 'l': return "\xc4\xbc"; 109 case 'm': return "\xe1\xb8\xbf"; 110 case 'n': return "\xc3\xb1"; 111 case 'o': return "\xc3\xb6"; 112 case 'p': return "\xc3\xbe"; 113 case 'q': return "\x51"; 114 case 'r': return "\xc5\x95"; 115 case 's': return "\xc5\xa1"; 116 case 't': return "\xc5\xa3"; 117 case 'u': return "\xc3\xbb"; 118 case 'v': return "\x56"; 119 case 'w': return "\xc5\xb5"; 120 case 'x': return "\xd1\x85"; 121 case 'y': return "\xc3\xbd"; 122 case 'z': return "\xc5\xbe"; 123 case 'A': return "\xc3\x85"; 124 case 'B': return "\xce\xb2"; 125 case 'C': return "\xc3\x87"; 126 case 'D': return "\xc3\x90"; 127 case 'E': return "\xc3\x89"; 128 case 'G': return "\xc4\x9c"; 129 case 'H': return "\xc4\xa4"; 130 case 'I': return "\xc3\x8e"; 131 case 'J': return "\xc4\xb4"; 132 case 'K': return "\xc4\xb6"; 133 case 'L': return "\xc4\xbb"; 134 case 'M': return "\xe1\xb8\xbe"; 135 case 'N': return "\xc3\x91"; 136 case 'O': return "\xc3\x96"; 137 case 'P': return "\xc3\x9e"; 138 case 'Q': return "\x71"; 139 case 'R': return "\xc5\x94"; 140 case 'S': return "\xc5\xa0"; 141 case 'T': return "\xc5\xa2"; 142 case 'U': return "\xc3\x9b"; 143 case 'V': return "\xce\xbd"; 144 case 'W': return "\xc5\xb4"; 145 case 'X': return "\xc3\x97"; 146 case 'Y': return "\xc3\x9d"; 147 case 'Z': return "\xc5\xbd"; 148 case '!': return "\xc2\xa1"; 149 case '?': return "\xc2\xbf"; 150 case '$': return "\xe2\x82\xac"; 151 default: return NULL; 152 } 153} 154 155static bool is_possible_normal_placeholder_end(const char16_t c) { 156 switch (c) { 157 case 's': return true; 158 case 'S': return true; 159 case 'c': return true; 160 case 'C': return true; 161 case 'd': return true; 162 case 'o': return true; 163 case 'x': return true; 164 case 'X': return true; 165 case 'f': return true; 166 case 'e': return true; 167 case 'E': return true; 168 case 'g': return true; 169 case 'G': return true; 170 case 'a': return true; 171 case 'A': return true; 172 case 'b': return true; 173 case 'B': return true; 174 case 'h': return true; 175 case 'H': return true; 176 case '%': return true; 177 case 'n': return true; 178 default: return false; 179 } 180} 181 182static String16 pseudo_generate_expansion(const unsigned int length) { 183 String16 result = k_expansion_string; 184 const char16_t* s = result.string(); 185 if (result.size() < length) { 186 result += String16(" "); 187 result += pseudo_generate_expansion(length - result.size()); 188 } else { 189 int ext = 0; 190 // Should contain only whole words, so looking for a space 191 for (unsigned int i = length + 1; i < result.size(); ++i) { 192 ++ext; 193 if (s[i] == ' ') { 194 break; 195 } 196 } 197 result.remove(length + ext, 0); 198 } 199 return result; 200} 201 202static bool is_space(const char16_t c) { 203 return (c == ' ' || c == '\t' || c == '\n'); 204} 205 206String16 PseudoMethodAccent::start() { 207 String16 result; 208 if (mDepth == 0) { 209 result = String16(String8("[")); 210 } 211 mWordCount = mLength = 0; 212 mDepth++; 213 return result; 214} 215 216String16 PseudoMethodAccent::end() { 217 String16 result; 218 if (mLength) { 219 result.append(String16(String8(" "))); 220 result.append(pseudo_generate_expansion( 221 mWordCount > 3 ? mLength : mLength / 2)); 222 } 223 mWordCount = mLength = 0; 224 mDepth--; 225 if (mDepth == 0) { 226 result.append(String16(String8("]"))); 227 } 228 return result; 229} 230 231/** 232 * Converts characters so they look like they've been localized. 233 * 234 * Note: This leaves escape sequences untouched so they can later be 235 * processed by ResTable::collectString in the normal way. 236 */ 237String16 PseudoMethodAccent::text(const String16& source) 238{ 239 const char16_t* s = source.string(); 240 String16 result; 241 const size_t I = source.size(); 242 bool lastspace = true; 243 for (size_t i=0; i<I; i++) { 244 char16_t c = s[i]; 245 if (c == '\\') { 246 // Escape syntax, no need to pseudolocalize 247 if (i<I-1) { 248 result += String16("\\"); 249 i++; 250 c = s[i]; 251 switch (c) { 252 case 'u': 253 // this one takes up 5 chars 254 result += String16(s+i, 5); 255 i += 4; 256 break; 257 case 't': 258 case 'n': 259 case '#': 260 case '@': 261 case '?': 262 case '"': 263 case '\'': 264 case '\\': 265 default: 266 result.append(&c, 1); 267 break; 268 } 269 } else { 270 result.append(&c, 1); 271 } 272 } else if (c == '%') { 273 // Placeholder syntax, no need to pseudolocalize 274 String16 chunk; 275 bool end = false; 276 chunk.append(&c, 1); 277 while (!end && i < I) { 278 ++i; 279 c = s[i]; 280 chunk.append(&c, 1); 281 if (is_possible_normal_placeholder_end(c)) { 282 end = true; 283 } else if (c == 't') { 284 ++i; 285 c = s[i]; 286 chunk.append(&c, 1); 287 end = true; 288 } 289 } 290 // Treat chunk as a placeholder unless it ends with %. 291 result += ((c == '%') ? chunk : placeholder(chunk)); 292 } else if (c == '<' || c == '&') { 293 // html syntax, no need to pseudolocalize 294 bool tag_closed = false; 295 while (!tag_closed && i < I) { 296 if (c == '&') { 297 String16 escape_text; 298 escape_text.append(&c, 1); 299 bool end = false; 300 size_t htmlCodePos = i; 301 while (!end && htmlCodePos < I) { 302 ++htmlCodePos; 303 c = s[htmlCodePos]; 304 escape_text.append(&c, 1); 305 // Valid html code 306 if (c == ';') { 307 end = true; 308 i = htmlCodePos; 309 } 310 // Wrong html code 311 else if (!((c == '#' || 312 (c >= 'a' && c <= 'z') || 313 (c >= 'A' && c <= 'Z') || 314 (c >= '0' && c <= '9')))) { 315 end = true; 316 } 317 } 318 result += escape_text; 319 if (escape_text != String16("<")) { 320 tag_closed = true; 321 } 322 continue; 323 } 324 if (c == '>') { 325 tag_closed = true; 326 result.append(&c, 1); 327 continue; 328 } 329 result.append(&c, 1); 330 i++; 331 c = s[i]; 332 } 333 } else { 334 // This is a pure text that should be pseudolocalized 335 const char* p = pseudolocalize_char(c); 336 if (p != NULL) { 337 result += String16(p); 338 } else { 339 bool space = is_space(c); 340 if (lastspace && !space) { 341 mWordCount++; 342 } 343 lastspace = space; 344 result.append(&c, 1); 345 } 346 // Count only pseudolocalizable chars and delimiters 347 mLength++; 348 } 349 } 350 return result; 351} 352String16 PseudoMethodAccent::placeholder(const String16& source) { 353 // Surround a placeholder with brackets 354 return k_placeholder_open + source + k_placeholder_close; 355} 356 357String16 PseudoMethodBidi::text(const String16& source) 358{ 359 const char16_t* s = source.string(); 360 String16 result; 361 bool lastspace = true; 362 bool space = true; 363 for (size_t i=0; i<source.size(); i++) { 364 char16_t c = s[i]; 365 space = is_space(c); 366 if (lastspace && !space) { 367 // Word start 368 result += k_rlm + k_rlo; 369 } else if (!lastspace && space) { 370 // Word end 371 result += k_pdf + k_rlm; 372 } 373 lastspace = space; 374 result.append(&c, 1); 375 } 376 if (!lastspace) { 377 // End of last word 378 result += k_pdf + k_rlm; 379 } 380 return result; 381} 382 383String16 PseudoMethodBidi::placeholder(const String16& source) { 384 // Surround a placeholder with directionality change sequence 385 return k_rlm + k_rlo + source + k_pdf + k_rlm; 386} 387 388