// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/getonescriptspan.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <stdio.h>
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string.h>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/lang_enc.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8propjustletter.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_basictypes.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_google.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_htmlutils.h"
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_unilib.h"
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf8utils.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const Language GRAY_LANG = (Language)254;
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Threshold for rounding a span to a word boundary: a span shorter than this
// is made longer, otherwise it is made shorter.
static const int kMaxUpToWordBoundary = 50;

// Maximum number of bytes to move (in the direction chosen by the threshold
// above) when rounding to a word boundary.
static const int kMaxAdvanceToWordBoundary = 10;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Per-byte flag table: 1 for the three bytes '&' (0x26), '<' (0x3c) and
// '>' (0x3e); 0 for every other byte value.
static const char kSpecialSymbol[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00-0x0f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x10-0x1f
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0,   // 0x20-0x2f  &
  0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,   // 0x30-0x3f  < >
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x40-0x4f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x50-0x5f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x60-0x6f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x70-0x7f

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x80-0x8f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x90-0x9f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xa0-0xaf
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xb0-0xbf
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xc0-0xcf
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xd0-0xdf
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xe0-0xef
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xf0-0xff
};
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define LT 0      // <
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define GT 1      // >
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define EX 2      // !
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define HY 3      // -
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define QU 4      // "
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define AP 5      // '
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define SL 6      // /
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define S_ 7
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define C_ 8
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define R_ 9
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define I_ 10
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define P_ 11
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define T_ 12
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define Y_ 13
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define L_ 14
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define E_ 15
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CR 16     // <cr> or <lf>
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define PL 18     // possible letter, incl. &
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define xx 19     // <unused>
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Map byte to one of ~20 interesting categories for cheap tag parsing
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint8 kCharToSub[256] = {
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef LT
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef GT
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef EX
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef HY
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef QU
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef AP
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef SL
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef S_
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef C_
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef R_
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef I_
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef P_
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef T_
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef Y_
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef L_
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef E_
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef CR
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef NL
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef PL
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef xx
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define OK 0
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define X_ 1
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// State machine to do cheap parse of non-letter strings incl. tags
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag>
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          |    |
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag> ... </tag>  for <script> <style>
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          |               |
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <!-- ... <tag> ... -->
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          |                     |
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          ||  (0)
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag <tag2>
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          ||  (0)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint8 kTagParseTbl_0[] = {
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef OK
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#undef X_
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/*
// Convert GetTimeOfDay output to 64-bit usec
static inline uint64 Microseconds(const struct timeval& t) {
  // The SumReducer uses uint64, so convert to (uint64) microseconds,
  // not (double) seconds.
  return t.tv_sec * 1000000ULL + t.tv_usec;
}
*/
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns true if character is < > or &
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool inline IsSpecial(char c) {
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if ((c & 0xe0) == 0x20) {
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return kSpecialSymbol[static_cast<uint8>(c)];
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Quick Skip to next letter or < > & or to end of string (eos)
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Always return is_letter for eos
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int ScanToLetterOrSpecial(const char* src, int len) {
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int bytes_consumed;
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       &bytes_consumed);
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return bytes_consumed;
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// src points to non-letter, such as tag-opening '<'
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return length from here to next possible letter
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// On eos or another < before >, return 1
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag>
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          |    |
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag> ... </tag>  for <script> <style>
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          |               |
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <!-- ... <tag> ... -->
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          |                     |
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          ||  (1)
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advances <tag <tag2>
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//          ||  (1)
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int ScanToPossibleLetter(const char* isrc, int len) {
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* src = reinterpret_cast<const uint8*>(isrc);
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* srclimit = src + len;
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* tagParseTbl = kTagParseTbl_0;
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int e = 0;
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (src < srclimit) {
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    e = tagParseTbl[kCharToSub[*src++]];
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if ((e & ~1) == 0) {
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // We overshot by one byte
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      --src;
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    tagParseTbl = &kTagParseTbl_0[e * 20];
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (src >= srclimit) {
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We fell off the end of the text.
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // It looks like the most common case for this is a truncated file, not
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // mismatched angle brackets. So we pretend that the last char was '>'
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return len;
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // OK to be in state 0 or state 2 at exit
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if ((e != 0) && (e != 2)) {
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Error, '<' followed by '<'
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We want to back up to first <, then advance by one byte past it
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int offset = src - reinterpret_cast<const uint8*>(isrc);
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Backscan to first '<' and return enough length to just get past it
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    --offset;   // back up over the second '<', which caused us to stop
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while ((0 < offset) && (isrc[offset] != '<')) {
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Find the first '<', which is unmatched
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      --offset;
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // skip to just beyond first '<'
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // printf("  returning %d\n", offset + 1);
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return offset + 1;
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return src - reinterpret_cast<const uint8*>(isrc);
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ScriptScanner::ScriptScanner(const char* buffer,
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             int buffer_length,
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             bool is_plain_text)
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  : start_byte_(buffer),
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  next_byte_(buffer),
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  next_byte_limit_(buffer + buffer_length),
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  byte_length_(buffer_length),
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  is_plain_text_(is_plain_text) {
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    script_buffer_ = new char[getone::kMaxScriptBuffer];
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ScriptScanner::~ScriptScanner() {
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] script_buffer_;
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] script_buffer_lower_;
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Get to the first real non-tag letter or entity that is a letter
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sets script of that letter
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return len if no more letters
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int sc = UNKNOWN_LSCRIPT;
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int skip = 0;
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int tlen, plen;
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Do run of non-letters (tag | &NL | NL)*
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (skip < len) {
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Do fast scan to next interesting byte
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // int oldskip = skip;
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    skip += ScanToLetterOrSpecial(src + skip, len - skip);
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // TEMP
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //       oldskip, src[oldskip], skip, src[skip]);
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Check for no more letters/specials
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (skip >= len) {
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // All done
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return len;
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We are at a letter, nonletter, tag, or entity
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (IsSpecial(src[skip]) && !is_plain_text_) {
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (src[skip] == '<') {
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Begining of tag; skip to end and go around again
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        tlen = ScanToPossibleLetter(src + skip, len - skip);
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sc = 0;
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // printf("<...> ");
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else if (src[skip] == '>') {
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Unexpected end of tag; skip it and go around again
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        tlen = 1;         // Over the >
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sc = 0;
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // printf("..> ");
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else if (src[skip] == '&') {
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Expand entity, no advance
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        char temp[4];
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        EntityToBuffer(src + skip, len - skip,
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       temp, &tlen, &plen);
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sc = getone::GetUTF8LetterScriptNum(temp);
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else {
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Update 1..4 bytes
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      tlen = cld_UniLib::OneCharLen(src + skip);
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      sc = getone::GetUTF8LetterScriptNum(src + skip);
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // TEMP
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // printf("sc=%d ", sc);
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (sc != 0) {break;}           // Letter found
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    skip += tlen;                   // Advance
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *script = sc;
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return skip;
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Compile-time switch consulted by ScriptScanner::GetOneScriptSpan when it
// copies a character into the output buffer. When the build defines
// NEED_ALIGNED_LOADS the flag is true and the scanner always takes the
// exact-length byte copy; otherwise it may use the 4-bytes-at-a-time copy.
#if defined(NEED_ALIGNED_LOADS)
static const bool kNeedsAlignedLoads = true;
#else
static const bool kNeedsAlignedLoads = false;
#endif
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copy next run of same-script non-tag letters to buffer [NUL terminated]
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Buffer has leading space and all text is lowercased
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->text = script_buffer_;
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->text_bytes = 0;
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->offset = next_byte_ - start_byte_;
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->script = UNKNOWN_LSCRIPT;
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->lang = UNKNOWN_LANGUAGE;
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->truncated = false;
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // printf("GetOneScriptSpan[[ ");
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // struct timeval script_start, script_mid, script_end;
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int spanscript;           // The script of this span
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int sc = UNKNOWN_LSCRIPT;  // The script of next character
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int tlen, plen;
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  script_buffer_[0] = ' ';  // Always a space at front of output
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  script_buffer_[1] = '\0';
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int take = 0;
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int put = 1;              // Start after the initial space
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // gettimeofday(&script_start, NULL);
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Get to the first real non-tag letter or entity that is a letter
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  next_byte_ += skip;
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  byte_length_ -= skip;
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (byte_length_ <= 0) {
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // printf("]]\n");
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;               // No more letters to be found
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // gettimeofday(&script_mid, NULL);
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // There is at least one letter, so we know the script for this span
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // printf("{%d} ", spanscript);
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->script = (UnicodeLScript)spanscript;
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Go over alternating spans of same-script letters and non-letters,
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // copying letters to buffer with single spaces for each run of non-letters
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (take < byte_length_) {
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Copy run of letters in same script (&LS | LS)*
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int letter_count = 0;              // Keep track of word length
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    bool need_break = false;
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while (take < byte_length_) {
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // We are at a letter, nonletter, tag, or entity
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // printf("\"%c\" ", next_byte_[take]);
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (next_byte_[take] == '<') {
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Begining of tag
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          sc = 0;
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          break;
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else if (next_byte_[take] == '>') {
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Unexpected end of tag
4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          sc = 0;
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          break;
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else if (next_byte_[take] == '&') {
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Copy entity, no advance
4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          EntityToBuffer(next_byte_ + take, byte_length_ - take,
4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         script_buffer_ + put, &tlen, &plen);
4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else {
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Real letter, safely copy up to 4 bytes, increment by 1..4
4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Will update by 1..4 bytes at Advance, below
4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) {
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Fast case
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          *reinterpret_cast<uint32*>(script_buffer_ + put) =
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            *reinterpret_cast<const uint32*>(next_byte_ + take);
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else {
4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Slow case, happens 1-3 times per input document
4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          memcpy(script_buffer_ + put, next_byte_ + take, plen);
4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // printf("sc(%c)=%d ", next_byte_[take], sc);
4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Allow continue across a single letter in a different script:
4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // A B D = three scripts, c = common script, i = inherited script,
4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // - = don't care, ( = take position before the += below
4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(A-    continue
4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //
4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(BA    continue
4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(BB    break
4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(Bc    continue (breaks after B)
4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(BD    break
4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(Bi    break
4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //
4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(c-    break
4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //
4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //  AAA(i-    continue
4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //
4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Might need to break this script span
4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (sc == ULScript_Common) {
4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          need_break = true;
4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else {
4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Look at next following character, ignoring entity as Common
4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            need_break = true;
4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          }
4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (need_break) {break;}  // Non-letter or letter in wrong script
4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      take += tlen;                   // Advance
4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      put += plen;                    // Advance
4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ++letter_count;
4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (put >= getone::kMaxScriptBytes) {
4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Buffer is full
4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        span->truncated = true;
4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }     // End while letters
4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Do run of non-letters (tag | &NL | NL)*
4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while (take < byte_length_) {
4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Do fast scan to next interesting byte
4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Check for no more letters/specials
4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (take >= byte_length_) {
4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        take = byte_length_;
4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // We are at a letter, nonletter, tag, or entity
4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // printf("\"%c\" ", next_byte_[take]);
4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (next_byte_[take] == '<') {
4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Begining of tag; skip to end and go around again
4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          sc = 0;
4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // printf("<...> ");
4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else if (next_byte_[take] == '>') {
4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Unexpected end of tag; skip it and go around again
4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          tlen = 1;         // Over the >
4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          sc = 0;
4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // printf("..> ");
4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else if (next_byte_[take] == '&') {
4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Expand entity, no advance
4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          EntityToBuffer(next_byte_ + take, byte_length_ - take,
4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         script_buffer_ + put, &tlen, &plen);
5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else {
5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Update 1..4
5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // printf("sc[%c]=%d ", next_byte_[take], sc);
5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (sc != 0) {break;}           // Letter found
5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      take += tlen;                   // Advance
5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }     // End while not-letters
5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    script_buffer_[put++] = ' ';
5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We are at a letter again (or eos), after letter* not-letter*
5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (sc != spanscript) {break;}            // Letter in wrong script
5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (put >= getone::kMaxScriptBytes - 8) {
5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Buffer is almost full
5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      span->truncated = true;
5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Update input position
5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  next_byte_ += take;
5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  byte_length_ -= take;
5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //                          kMaxScriptBytes |   | put
5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  script_buffer_[put + 0] = ' ';
5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  script_buffer_[put + 1] = ' ';
5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  script_buffer_[put + 2] = ' ';
5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  script_buffer_[put + 3] = '\0';
5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  span->text_bytes = put;       // Does not include the last four chars above
5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // printf(" %d]]\n\n", put);
5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Force Latin, Cyrillic, Greek scripts to be lowercase
5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // On Windows, text is lowercased beforehand, so no need to do anything here.
5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if !defined(CLD_WINDOWS)
5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If needed, lowercase all the text. If we do it sooner, might miss
5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // lowercasing an entity such as &Aacute;
5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We only need to do this for Latn and Cyrl scripts
5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if ((span->script == ULScript_Latin) ||
5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (span->script == ULScript_Cyrillic) ||
5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (span->script == ULScript_Greek)) {
5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Full Unicode lowercase of the entire buffer, including
5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // four pad bytes off the end
5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int consumed, filled;
5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    UniLib::ToLower(span->text, span->text_bytes + 4,
5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    script_buffer_lower_, getone::kMaxScriptLowerBuffer,
5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    &consumed, &filled);
5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    span->text = script_buffer_lower_;
5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    span->text_bytes = filled - 4;
5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copy next run of same-script non-tag letters to buffer [NUL terminated]
5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Force Latin and Cyrillic scripts to be lowercase
5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool ok = GetOneScriptSpan(span);
5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  LowerScriptSpan(span);
5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return ok;
5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Gets lscript number for letters; always returns
5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   0 (common script) for non-letters
5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int getone::GetUTF8LetterScriptNum(const char* src) {
5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int srclen = cld_UniLib::OneCharLen(src);
5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* usrc = reinterpret_cast<const uint8*>(src);
5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
577