15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/getonescriptspan.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <stdio.h> 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string.h> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/lang_enc.h" 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8propjustletter.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8propletterscriptnum.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_basictypes.h" 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_commandlineflags.h" 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_google.h" 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_htmlutils.h" 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_unilib.h" 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf8statetable.h" 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf8utils.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const Language GRAY_LANG = (Language)254; 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMaxUpToWordBoundary = 50; // span < this make longer, 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // else make shorter 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to round to word boundary, 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // direction above 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const char kSpecialSymbol[256] = { // true for < > & 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define LT 0 // < 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define GT 1 // > 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define EX 2 // ! 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define HY 3 // - 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define QU 4 // " 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define AP 5 // ' 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define SL 6 // / 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define S_ 7 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define C_ 8 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define R_ 9 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define I_ 10 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define P_ 11 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define T_ 12 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define Y_ 13 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define L_ 14 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define E_ 15 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CR 16 // <cr> or <lf> 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define NL 17 // non-letter: ASCII whitespace, digit, punctuation 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define PL 18 // possible letter, incl. & 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define xx 19 // <unused> 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Map byte to one of ~20 interesting categories for cheap tag parsing 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint8 kCharToSub[256] = { 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef LT 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef GT 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef EX 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef HY 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef QU 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef AP 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef SL 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef S_ 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef C_ 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef R_ 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef I_ 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef P_ 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef T_ 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef Y_ 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef L_ 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef E_ 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef CR 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef NL 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef PL 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef xx 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define OK 0 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define X_ 1 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// State machine to do cheap parse of non-letter strings incl. tags 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag> 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// | | 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag> ... </tag> for <script> <style> 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// | | 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <!-- ... <tag> ... --> 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// | | 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// || (0) 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag <tag2> 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// || (0) 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint8 kTagParseTbl_0[] = { 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// < > ! - " ' / S C R I P T Y L E CR NL PL xx 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// < > ! - " ' / S C R I P T Y L E CR NL PL xx 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</ 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// < > ! - " ' / S C R I P T Y L E CR NL PL xx 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</ 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef OK 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef X_ 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/* 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Convert GetTimeOfDay output to 64-bit usec 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static inline uint64 Microseconds(const struct timeval& t) { 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The SumReducer uses uint64, so convert to (uint64) microseconds, 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // not (double) seconds. 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return t.tv_sec * 1000000ULL + t.tv_usec; 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/ 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns true if character is < > or & 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool inline IsSpecial(char c) { 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((c & 0xe0) == 0x20) { 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return kSpecialSymbol[static_cast<uint8>(c)]; 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Quick Skip to next letter or < > & or to end of string (eos) 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Always return is_letter for eos 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int ScanToLetterOrSpecial(const char* src, int len) { 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int bytes_consumed; 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &bytes_consumed); 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return bytes_consumed; 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// src points to non-letter, such as tag-opening '<' 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return length from here to next possible letter 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// On eos or another < before >, return 1 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag> 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// | | 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag> ... </tag> for <script> <style> 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// | | 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <!-- ... <tag> ... --> 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// | | 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// || (1) 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag <tag2> 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// || (1) 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int ScanToPossibleLetter(const char* isrc, int len) { 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* src = reinterpret_cast<const uint8*>(isrc); 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* srclimit = src + len; 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* tagParseTbl = kTagParseTbl_0; 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int e = 0; 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (src < srclimit) { 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) e = tagParseTbl[kCharToSub[*src++]]; 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((e & ~1) == 0) { 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We overshot by one byte 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --src; 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tagParseTbl = &kTagParseTbl_0[e * 20]; 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src >= srclimit) { 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We fell off the end of the text. 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // It looks like the most common case for this is a truncated file, not 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // mismatched angle brackets. So we pretend that the last char was '>' 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return len; 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // OK to be in state 0 or state 2 at exit 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((e != 0) && (e != 2)) { 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Error, '<' followed by '<' 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We want to back up to first <, then advance by one byte past it 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int offset = src - reinterpret_cast<const uint8*>(isrc); 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc); 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Backscan to first '<' and return enough length to just get past it 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --offset; // back up over the second '<', which caused us to stop 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while ((0 < offset) && (isrc[offset] != '<')) { 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Find the first '<', which is unmatched 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --offset; 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // skip to just beyond first '<' 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf(" returning %d\n", offset + 1); 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return offset + 1; 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return src - reinterpret_cast<const uint8*>(isrc); 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ScriptScanner::ScriptScanner(const char* buffer, 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int buffer_length, 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_plain_text) 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : start_byte_(buffer), 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next_byte_(buffer), 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next_byte_limit_(buffer + buffer_length), 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) byte_length_(buffer_length), 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) is_plain_text_(is_plain_text) { 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_ = new char[getone::kMaxScriptBuffer]; 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer]; 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ScriptScanner::~ScriptScanner() { 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] script_buffer_; 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] script_buffer_lower_; 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Get to the first real non-tag letter or entity that is a letter 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets script of that letter 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return len if no more letters 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int sc = UNKNOWN_LSCRIPT; 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int skip = 0; 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int tlen, plen; 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Do run of non-letters (tag | &NL | NL)* 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (skip < len) { 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Do fast scan to next interesting byte 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // int oldskip = skip; 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) skip += ScanToLetterOrSpecial(src + skip, len - skip); 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TEMP 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n", 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // oldskip, src[oldskip], skip, src[skip]); 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Check for no more letters/specials 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (skip >= len) { 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // All done 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return len; 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We are at a letter, nonletter, tag, or entity 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsSpecial(src[skip]) && !is_plain_text_) { 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src[skip] == '<') { 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begining of tag; skip to end and go around again 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tlen = ScanToPossibleLetter(src + skip, len - skip); 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = 0; 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("<...> "); 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (src[skip] == '>') { 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Unexpected end of tag; skip it and go around again 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tlen = 1; // Over the > 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = 0; 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("..> "); 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (src[skip] == '&') { 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Expand entity, no advance 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char temp[4]; 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EntityToBuffer(src + skip, len - skip, 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) temp, &tlen, &plen); 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = getone::GetUTF8LetterScriptNum(temp); 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc); 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Update 1..4 bytes 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tlen = cld_UniLib::OneCharLen(src + skip); 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = getone::GetUTF8LetterScriptNum(src + skip); 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc); 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TEMP 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("sc=%d ", sc); 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sc != 0) {break;} // Letter found 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) skip += tlen; // Advance 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *script = sc; 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return skip; 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifdef NEED_ALIGNED_LOADS 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const bool kNeedsAlignedLoads = true; 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const bool kNeedsAlignedLoads = false; 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copy next run of same-script non-tag letters to buffer [NUL terminated] 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Buffer has leading space and all text is lowercased 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) { 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->text = script_buffer_; 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->text_bytes = 0; 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->offset = next_byte_ - start_byte_; 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->script = UNKNOWN_LSCRIPT; 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->lang = UNKNOWN_LANGUAGE; 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->truncated = false; 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("GetOneScriptSpan[[ "); 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // struct timeval script_start, script_mid, script_end; 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int spanscript; // The script of this span 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int sc = UNKNOWN_LSCRIPT; // The script of next character 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int tlen, plen; 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_[0] = ' '; // Always a space at front of output 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_[1] = '\0'; 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int take = 0; 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int put = 1; // Start after the initial space 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // gettimeofday(&script_start, NULL); 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Get to the first real non-tag letter or entity that is a letter 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next_byte_ += skip; 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) byte_length_ -= skip; 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (byte_length_ <= 0) { 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("]]\n"); 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; // No more letters to be found 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // gettimeofday(&script_mid, NULL); 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // There is at least one letter, so we know the script for this span 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("{%d} ", spanscript); 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->script = (UnicodeLScript)spanscript; 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Go over alternating spans of same-script letters and non-letters, 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // copying letters to buffer with single spaces for each run of non-letters 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (take < byte_length_) { 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Copy run of letters in same script (&LS | LS)* 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int letter_count = 0; // Keep track of word length 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool need_break = false; 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (take < byte_length_) { 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We are at a letter, nonletter, tag, or entity 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("\"%c\" ", next_byte_[take]); 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next_byte_[take] == '<') { 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begining of tag 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = 0; 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (next_byte_[take] == '>') { 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Unexpected end of tag 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = 0; 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (next_byte_[take] == '&') { 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Copy entity, no advance 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EntityToBuffer(next_byte_ + take, byte_length_ - take, 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_ + put, &tlen, &plen); 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Real letter, safely copy up to 4 bytes, increment by 1..4 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Will update by 1..4 bytes at Advance, below 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take); 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) { 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Fast case 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *reinterpret_cast<uint32*>(script_buffer_ + put) = 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *reinterpret_cast<const uint32*>(next_byte_ + take); 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Slow case, happens 1-3 times per input document 4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memcpy(script_buffer_ + put, next_byte_ + take, plen); 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); 4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("sc(%c)=%d ", next_byte_[take], sc); 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen); 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc); 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Allow continue across a single letter in a different script: 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A B D = three scripts, c = common script, i = inherited script, 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // - = don't care, ( = take position before the += below 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(A- continue 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(BA continue 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(BB break 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(Bc continue (breaks after B) 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(BD break 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(Bi break 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(c- break 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AAA(i- continue 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((sc != spanscript) && (sc != ULScript_Inherited)) { 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Might need to break this script span 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sc == ULScript_Common) { 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) need_break = true; 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look at next following character, ignoring entity as Common 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen); 4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) need_break = true; 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (need_break) {break;} // Non-letter or letter in wrong script 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) take += tlen; // Advance 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) put += plen; // Advance 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++letter_count; 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (put >= getone::kMaxScriptBytes) { 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Buffer is full 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->truncated = true; 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } // End while letters 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Do run of non-letters (tag | &NL | NL)* 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (take < byte_length_) { 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Do fast scan to next interesting byte 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Check for no more letters/specials 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (take >= byte_length_) { 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) take = byte_length_; 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We are at a letter, nonletter, tag, or entity 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsSpecial(next_byte_[take]) && !is_plain_text_) { 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("\"%c\" ", next_byte_[take]); 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next_byte_[take] == '<') { 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begining of tag; skip to end and go around again 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take); 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = 0; 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("<...> "); 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (next_byte_[take] == '>') { 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Unexpected end of tag; skip it and go around again 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tlen = 1; // Over the > 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = 0; 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("..> "); 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (next_byte_[take] == '&') { 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Expand entity, no advance 4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EntityToBuffer(next_byte_ + take, byte_length_ - take, 4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_ + put, &tlen, &plen); 5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); 5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Update 1..4 5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tlen = cld_UniLib::OneCharLen(next_byte_ + take); 5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); 5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf("sc[%c]=%d ", next_byte_[take], sc); 5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sc != 0) {break;} // Letter found 5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) take += tlen; // Advance 5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } // End while not-letters 5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_[put++] = ' '; 5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We are at a letter again (or eos), after letter* not-letter* 5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sc != spanscript) {break;} // Letter in wrong script 5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (put >= getone::kMaxScriptBytes - 8) { 5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Buffer is almost full 5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->truncated = true; 5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Update input position 5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next_byte_ += take; 5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) byte_length_ -= take; 5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kMaxScriptBytes | | put 5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_[put + 0] = ' '; 5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_[put + 1] = ' '; 5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_[put + 2] = ' '; 5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_[put + 3] = '\0'; 5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->text_bytes = put; // Does not include the last four chars above 5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // printf(" %d]]\n\n", put); 5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Force Latin, Cyrillic, Greek scripts to be lowercase 5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) { 5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // On Windows, text is lowercased beforehand, so no need to do anything here. 5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if !defined(CLD_WINDOWS) 5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If needed, lowercase all the text. If we do it sooner, might miss 5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // lowercasing an entity such as Á 5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We only need to do this for Latn and Cyrl scripts 5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((span->script == ULScript_Latin) || 5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (span->script == ULScript_Cyrillic) || 5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (span->script == ULScript_Greek)) { 5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Full Unicode lowercase of the entire buffer, including 5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // four pad bytes off the end 5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int consumed, filled; 5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UniLib::ToLower(span->text, span->text_bytes + 4, 5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) script_buffer_lower_, getone::kMaxScriptLowerBuffer, 5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &consumed, &filled); 5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->text = script_buffer_lower_; 5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) span->text_bytes = filled - 4; 5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copy next run of same-script non-tag letters to buffer [NUL terminated] 5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Force Latin and Cyrillic scripts to be lowercase 5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) { 5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ok = GetOneScriptSpan(span); 5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LowerScriptSpan(span); 5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ok; 5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Gets lscript number for letters; always returns 5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 0 (common script) for non-letters 5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int getone::GetUTF8LetterScriptNum(const char* src) { 5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int srclen = cld_UniLib::OneCharLen(src); 5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* usrc = reinterpret_cast<const uint8*>(src); 5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen); 5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 577