1// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "encodings/compact_lang_det/getonescriptspan.h" 6#include <stdio.h> 7#include <string.h> 8 9#include "base/basictypes.h" 10#include "encodings/lang_enc.h" 11#include "encodings/compact_lang_det/utf8propjustletter.h" 12#include "encodings/compact_lang_det/utf8propletterscriptnum.h" 13#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h" 14 15#include "encodings/compact_lang_det/win/cld_basictypes.h" 16#include "encodings/compact_lang_det/win/cld_commandlineflags.h" 17#include "encodings/compact_lang_det/win/cld_google.h" 18#include "encodings/compact_lang_det/win/cld_htmlutils.h" 19#include "encodings/compact_lang_det/win/cld_unilib.h" 20#include "encodings/compact_lang_det/win/cld_utf8statetable.h" 21#include "encodings/compact_lang_det/win/cld_utf8utils.h" 22 23static const Language GRAY_LANG = (Language)254; 24 25static const int kMaxUpToWordBoundary = 50; // span < this make longer, 26 // else make shorter 27static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes 28 // to round to word boundary, 29 // direction above 30 31static const char kSpecialSymbol[256] = { // true for < > & 32 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 33 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, 34 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 35 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 36 37 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 38 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 39 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 40 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 41}; 42 43 44 45#define LT 0 // < 46#define GT 1 // > 47#define EX 2 // ! 48#define HY 3 // - 49#define QU 4 // " 50#define AP 5 // ' 51#define SL 6 // / 52#define S_ 7 53#define C_ 8 54#define R_ 9 55#define I_ 10 56#define P_ 11 57#define T_ 12 58#define Y_ 13 59#define L_ 14 60#define E_ 15 61#define CR 16 // <cr> or <lf> 62#define NL 17 // non-letter: ASCII whitespace, digit, punctuation 63#define PL 18 // possible letter, incl. & 64#define xx 19 // <unused> 65 66// Map byte to one of ~20 interesting categories for cheap tag parsing 67static const uint8 kCharToSub[256] = { 68 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, 69 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 70 NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, 71 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, 72 73 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 74 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 75 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 76 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 77 78 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 79 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 80 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 81 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 82 83 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 84 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 85 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 86 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 87}; 88 89#undef LT 90#undef GT 91#undef EX 92#undef HY 93#undef QU 94#undef AP 95#undef SL 96#undef S_ 97#undef C_ 98#undef R_ 99#undef I_ 100#undef P_ 101#undef T_ 102#undef Y_ 103#undef L_ 104#undef E_ 105#undef CR 106#undef NL 107#undef PL 108#undef xx 109 110 111#define OK 0 112#define X_ 1 113 114// State machine to do cheap parse of non-letter strings incl. tags 115// advances <tag> 116// | | 117// advances <tag> ... </tag> for <script> <style> 118// | | 119// advances <!-- ... <tag> ... --> 120// | | 121// advances <tag 122// || (0) 123// advances <tag <tag2> 124// || (0) 125static const uint8 kTagParseTbl_0[] = { 126// < > ! - " ' / S C R I P T Y L E CR NL PL xx 127 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK 128 X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error 129 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* 130 X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < 131 X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! 132 X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- 133 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* 134 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- 135 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- 136 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* 137 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" 138 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' 139 X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' 140 141// < > ! - " ' / S C R I P T Y L E CR NL PL xx 142 X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S 143 X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC 144 X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR 145 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI 146 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP 147 X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT 148 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* 149 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< 150 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</ 151 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S 152 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC 153 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR 154 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI 155 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP 156 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT 157 158// < > ! - " ' / S C R I P T Y L E CR NL PL xx 159 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST 160 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY 161 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL 162 X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE 163 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* 164 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< 165 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</ 166 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S 167 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST 168 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY 169 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL 170 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE 171}; 172 173#undef OK 174#undef X_ 175 176 177/* 178// Convert GetTimeOfDay output to 64-bit usec 179static inline uint64 Microseconds(const struct timeval& t) { 180 // The SumReducer uses uint64, so convert to (uint64) microseconds, 181 // not (double) seconds. 182 return t.tv_sec * 1000000ULL + t.tv_usec; 183} 184*/ 185 186 187// Returns true if character is < > or & 188bool inline IsSpecial(char c) { 189 if ((c & 0xe0) == 0x20) { 190 return kSpecialSymbol[static_cast<uint8>(c)]; 191 } 192 return false; 193} 194 195// Quick Skip to next letter or < > & or to end of string (eos) 196// Always return is_letter for eos 197int ScanToLetterOrSpecial(const char* src, int len) { 198 int bytes_consumed; 199 cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, 200 &bytes_consumed); 201 return bytes_consumed; 202} 203 204 205 206// src points to non-letter, such as tag-opening '<' 207// Return length from here to next possible letter 208// On eos or another < before >, return 1 209// advances <tag> 210// | | 211// advances <tag> ... </tag> for <script> <style> 212// | | 213// advances <!-- ... <tag> ... --> 214// | | 215// advances <tag 216// || (1) 217// advances <tag <tag2> 218// || (1) 219int ScanToPossibleLetter(const char* isrc, int len) { 220 const uint8* src = reinterpret_cast<const uint8*>(isrc); 221 const uint8* srclimit = src + len; 222 const uint8* tagParseTbl = kTagParseTbl_0; 223 int e = 0; 224 while (src < srclimit) { 225 e = tagParseTbl[kCharToSub[*src++]]; 226 if ((e & ~1) == 0) { 227 // We overshot by one byte 228 --src; 229 break; 230 } 231 tagParseTbl = &kTagParseTbl_0[e * 20]; 232 } 233 234 if (src >= srclimit) { 235 // We fell off the end of the text. 236 // It looks like the most common case for this is a truncated file, not 237 // mismatched angle brackets. So we pretend that the last char was '>' 238 return len; 239 } 240 241 // OK to be in state 0 or state 2 at exit 242 if ((e != 0) && (e != 2)) { 243 // Error, '<' followed by '<' 244 // We want to back up to first <, then advance by one byte past it 245 int offset = src - reinterpret_cast<const uint8*>(isrc); 246 // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc); 247 248 // Backscan to first '<' and return enough length to just get past it 249 --offset; // back up over the second '<', which caused us to stop 250 while ((0 < offset) && (isrc[offset] != '<')) { 251 // Find the first '<', which is unmatched 252 --offset; 253 } 254 // skip to just beyond first '<' 255 // printf(" returning %d\n", offset + 1); 256 return offset + 1; 257 } 258 259 return src - reinterpret_cast<const uint8*>(isrc); 260} 261 262 263 264ScriptScanner::ScriptScanner(const char* buffer, 265 int buffer_length, 266 bool is_plain_text) 267 : start_byte_(buffer), 268 next_byte_(buffer), 269 next_byte_limit_(buffer + buffer_length), 270 byte_length_(buffer_length), 271 is_plain_text_(is_plain_text) { 272 script_buffer_ = new char[getone::kMaxScriptBuffer]; 273 script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer]; 274} 275 276ScriptScanner::~ScriptScanner() { 277 delete[] script_buffer_; 278 delete[] script_buffer_lower_; 279} 280 281 282 283 284// Get to the first real non-tag letter or entity that is a letter 285// Sets script of that letter 286// Return len if no more letters 287int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { 288 int sc = UNKNOWN_LSCRIPT; 289 int skip = 0; 290 int tlen, plen; 291 292 // Do run of non-letters (tag | &NL | NL)* 293 while (skip < len) { 294 // Do fast scan to next interesting byte 295 // int oldskip = skip; 296 skip += ScanToLetterOrSpecial(src + skip, len - skip); 297 // TEMP 298 // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n", 299 // oldskip, src[oldskip], skip, src[skip]); 300 301 // Check for no more letters/specials 302 if (skip >= len) { 303 // All done 304 return len; 305 } 306 307 // We are at a letter, nonletter, tag, or entity 308 if (IsSpecial(src[skip]) && !is_plain_text_) { 309 if (src[skip] == '<') { 310 // Begining of tag; skip to end and go around again 311 tlen = ScanToPossibleLetter(src + skip, len - skip); 312 sc = 0; 313 // printf("<...> "); 314 } else if (src[skip] == '>') { 315 // Unexpected end of tag; skip it and go around again 316 tlen = 1; // Over the > 317 sc = 0; 318 // printf("..> "); 319 } else if (src[skip] == '&') { 320 // Expand entity, no advance 321 char temp[4]; 322 EntityToBuffer(src + skip, len - skip, 323 temp, &tlen, &plen); 324 sc = getone::GetUTF8LetterScriptNum(temp); 325 // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc); 326 } 327 } else { 328 // Update 1..4 bytes 329 tlen = cld_UniLib::OneCharLen(src + skip); 330 sc = getone::GetUTF8LetterScriptNum(src + skip); 331 // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc); 332 } 333 // TEMP 334 // printf("sc=%d ", sc); 335 if (sc != 0) {break;} // Letter found 336 skip += tlen; // Advance 337 } 338 339 *script = sc; 340 return skip; 341} 342 343#ifdef NEED_ALIGNED_LOADS 344static const bool kNeedsAlignedLoads = true; 345#else 346static const bool kNeedsAlignedLoads = false; 347#endif 348 349 350// Copy next run of same-script non-tag letters to buffer [NUL terminated] 351// Buffer has leading space and all text is lowercased 352bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) { 353 span->text = script_buffer_; 354 span->text_bytes = 0; 355 span->offset = next_byte_ - start_byte_; 356 span->script = UNKNOWN_LSCRIPT; 357 span->lang = UNKNOWN_LANGUAGE; 358 span->truncated = false; 359 360 // printf("GetOneScriptSpan[[ "); 361 // struct timeval script_start, script_mid, script_end; 362 363 int spanscript; // The script of this span 364 int sc = UNKNOWN_LSCRIPT; // The script of next character 365 int tlen, plen; 366 367 368 script_buffer_[0] = ' '; // Always a space at front of output 369 script_buffer_[1] = '\0'; 370 int take = 0; 371 int put = 1; // Start after the initial space 372 373 // gettimeofday(&script_start, NULL); 374 // Get to the first real non-tag letter or entity that is a letter 375 int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); 376 next_byte_ += skip; 377 byte_length_ -= skip; 378 if (byte_length_ <= 0) { 379 // printf("]]\n"); 380 return false; // No more letters to be found 381 } 382 383 // gettimeofday(&script_mid, NULL); 384 385 // There is at least one letter, so we know the script for this span 386 // printf("{%d} ", spanscript); 387 span->script = (UnicodeLScript)spanscript; 388 389 390 // Go over alternating spans of same-script letters and non-letters, 391 // copying letters to buffer with single spaces for each run of non-letters 392 while (take < byte_length_) { 393 // Copy run of letters in same script (&LS | LS)* 394 int letter_count = 0; // Keep track of word length 395 bool need_break = false; 396 while (take < byte_length_) { 397 // We are at a letter, nonletter, tag, or entity 398 if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 399 // printf("\"%c\" ", next_byte_[take]); 400 if (next_byte_[take] == '<') { 401 // Begining of tag 402 sc = 0; 403 break; 404 } else if (next_byte_[take] == '>') { 405 // Unexpected end of tag 406 sc = 0; 407 break; 408 } else if (next_byte_[take] == '&') { 409 // Copy entity, no advance 410 EntityToBuffer(next_byte_ + take, byte_length_ - take, 411 script_buffer_ + put, &tlen, &plen); 412 sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); 413 } 414 } else { 415 // Real letter, safely copy up to 4 bytes, increment by 1..4 416 // Will update by 1..4 bytes at Advance, below 417 tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take); 418 if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) { 419 // Fast case 420 *reinterpret_cast<uint32*>(script_buffer_ + put) = 421 *reinterpret_cast<const uint32*>(next_byte_ + take); 422 } else { 423 // Slow case, happens 1-3 times per input document 424 memcpy(script_buffer_ + put, next_byte_ + take, plen); 425 } 426 sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); 427 } 428 // printf("sc(%c)=%d ", next_byte_[take], sc); 429 // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen); 430 // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc); 431 432 // Allow continue across a single letter in a different script: 433 // A B D = three scripts, c = common script, i = inherited script, 434 // - = don't care, ( = take position before the += below 435 // AAA(A- continue 436 // 437 // AAA(BA continue 438 // AAA(BB break 439 // AAA(Bc continue (breaks after B) 440 // AAA(BD break 441 // AAA(Bi break 442 // 443 // AAA(c- break 444 // 445 // AAA(i- continue 446 // 447 448 if ((sc != spanscript) && (sc != ULScript_Inherited)) { 449 // Might need to break this script span 450 if (sc == ULScript_Common) { 451 need_break = true; 452 } else { 453 // Look at next following character, ignoring entity as Common 454 int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen); 455 if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { 456 need_break = true; 457 } 458 } 459 } 460 if (need_break) {break;} // Non-letter or letter in wrong script 461 462 take += tlen; // Advance 463 put += plen; // Advance 464 ++letter_count; 465 if (put >= getone::kMaxScriptBytes) { 466 // Buffer is full 467 span->truncated = true; 468 break; 469 } 470 } // End while letters 471 472 // Do run of non-letters (tag | &NL | NL)* 473 while (take < byte_length_) { 474 // Do fast scan to next interesting byte 475 take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); 476 477 // Check for no more letters/specials 478 if (take >= byte_length_) { 479 take = byte_length_; 480 break; 481 } 482 483 // We are at a letter, nonletter, tag, or entity 484 if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 485 // printf("\"%c\" ", next_byte_[take]); 486 if (next_byte_[take] == '<') { 487 // Begining of tag; skip to end and go around again 488 tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take); 489 sc = 0; 490 // printf("<...> "); 491 } else if (next_byte_[take] == '>') { 492 // Unexpected end of tag; skip it and go around again 493 tlen = 1; // Over the > 494 sc = 0; 495 // printf("..> "); 496 } else if (next_byte_[take] == '&') { 497 // Expand entity, no advance 498 EntityToBuffer(next_byte_ + take, byte_length_ - take, 499 script_buffer_ + put, &tlen, &plen); 500 sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); 501 } 502 } else { 503 // Update 1..4 504 tlen = cld_UniLib::OneCharLen(next_byte_ + take); 505 sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); 506 } 507 // printf("sc[%c]=%d ", next_byte_[take], sc); 508 if (sc != 0) {break;} // Letter found 509 take += tlen; // Advance 510 } // End while not-letters 511 512 script_buffer_[put++] = ' '; 513 514 // We are at a letter again (or eos), after letter* not-letter* 515 if (sc != spanscript) {break;} // Letter in wrong script 516 if (put >= getone::kMaxScriptBytes - 8) { 517 // Buffer is almost full 518 span->truncated = true; 519 break; 520 } 521 } 522 523 // Update input position 524 next_byte_ += take; 525 byte_length_ -= take; 526 527 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 528 // kMaxScriptBytes | | put 529 script_buffer_[put + 0] = ' '; 530 script_buffer_[put + 1] = ' '; 531 script_buffer_[put + 2] = ' '; 532 script_buffer_[put + 3] = '\0'; 533 534 span->text_bytes = put; // Does not include the last four chars above 535 536 // printf(" %d]]\n\n", put); 537 return true; 538} 539 540// Force Latin, Cyrillic, Greek scripts to be lowercase 541void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) { 542 // On Windows, text is lowercased beforehand, so no need to do anything here. 543#if !defined(CLD_WINDOWS) 544 // If needed, lowercase all the text. If we do it sooner, might miss 545 // lowercasing an entity such as Á 546 // We only need to do this for Latn and Cyrl scripts 547 if ((span->script == ULScript_Latin) || 548 (span->script == ULScript_Cyrillic) || 549 (span->script == ULScript_Greek)) { 550 // Full Unicode lowercase of the entire buffer, including 551 // four pad bytes off the end 552 int consumed, filled; 553 UniLib::ToLower(span->text, span->text_bytes + 4, 554 script_buffer_lower_, getone::kMaxScriptLowerBuffer, 555 &consumed, &filled); 556 span->text = script_buffer_lower_; 557 span->text_bytes = filled - 4; 558 } 559#endif 560} 561 562// Copy next run of same-script non-tag letters to buffer [NUL terminated] 563// Force Latin and Cyrillic scripts to be lowercase 564bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) { 565 bool ok = GetOneScriptSpan(span); 566 LowerScriptSpan(span); 567 return ok; 568} 569 570// Gets lscript number for letters; always returns 571// 0 (common script) for non-letters 572int getone::GetUTF8LetterScriptNum(const char* src) { 573 int srclen = cld_UniLib::OneCharLen(src); 574 const uint8* usrc = reinterpret_cast<const uint8*>(src); 575 return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen); 576} 577