1// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "encodings/compact_lang_det/getonescriptspan.h"
6#include <stdio.h>
7#include <string.h>
8
9#include "base/basictypes.h"
10#include "encodings/lang_enc.h"
11#include "encodings/compact_lang_det/utf8propjustletter.h"
12#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
13#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
14
15#include "encodings/compact_lang_det/win/cld_basictypes.h"
16#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
17#include "encodings/compact_lang_det/win/cld_google.h"
18#include "encodings/compact_lang_det/win/cld_htmlutils.h"
19#include "encodings/compact_lang_det/win/cld_unilib.h"
20#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
21#include "encodings/compact_lang_det/win/cld_utf8utils.h"
22
// Language value 254, used as a file-local sentinel. It is not referenced in
// the visible part of this file.
// NOTE(review): presumably marks "gray" (undecided) text -- confirm before use.
static const Language GRAY_LANG = (Language)254;
24
// Tuning constants for rounding a span end to a word boundary. Neither
// constant is referenced in the visible part of this file.
static const int kMaxUpToWordBoundary = 50;       // A span shorter than this
                                                  // is made longer; otherwise
                                                  // it is made shorter
static const int kMaxAdvanceToWordBoundary = 10;  // Move at most this many
                                                  // bytes to reach a word
                                                  // boundary, in the direction
                                                  // chosen above
30
// Lookup table indexed by unsigned byte value: non-zero only for the three
// bytes '&' (0x26), '<' (0x3C) and '>' (0x3E). IsSpecial() consults it.
static const char kSpecialSymbol[256] = {       // true for < > &
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
42
43
44
// Character categories used as column indexes into kTagParseTbl_0.
// The macros are short-lived: they are #undef'd right after the table below.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
#define S_ 7      // S or s
#define C_ 8      // C or c
#define R_ 9      // R or r
#define I_ 10     // I or i
#define P_ 11     // P or p
#define T_ 12     // T or t
#define Y_ 13     // Y or y
#define L_ 14     // L or l
#define E_ 15     // E or e
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing.
// Layout, 16 entries per row:
//   0x00-0x3F  control characters, digits and punctuation (0x0A and 0x0D are CR)
//   0x40-0x7F  ASCII letters; the letters of SCRIPT and STYLE get their own
//              category in both upper and lower case
//   0x80-0xBF  all NL
//   0xC0-0xFF  all PL
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

// The category names are only needed to spell out the table above.
#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
109
110
// Terminal table entries: OK (0) stops the scan normally, X_ (1) stops it
// with an error. Both macros are #undef'd right after the table.
#define OK 0
#define X_ 1

// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// Layout: one row of 20 entries per state; the column is the kCharToSub
// category of the current byte and the entry is the next state.
// ScanToPossibleLetter() starts in state 0 and stops as soon as an entry is
// OK (0) or X_ (1), without consuming the byte that produced it.
//
// NOTE(review): in states 20-27 (and 33-39 for STYLE) a '<' goes back to the
// body state 19 (32) instead of state 20 (33), so input such as "<</script>"
// is not recognised as a closing tag -- confirm whether this is intended.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
175
176
177/*
178// Convert GetTimeOfDay output to 64-bit usec
179static inline uint64 Microseconds(const struct timeval& t) {
180  // The SumReducer uses uint64, so convert to (uint64) microseconds,
181  // not (double) seconds.
182  return t.tv_sec * 1000000ULL + t.tv_usec;
183}
184*/
185
186
187// Returns true if character is < > or &
188bool inline IsSpecial(char c) {
189  if ((c & 0xe0) == 0x20) {
190    return kSpecialSymbol[static_cast<uint8>(c)];
191  }
192  return false;
193}
194
195// Quick Skip to next letter or < > & or to end of string (eos)
196// Always return is_letter for eos
197int ScanToLetterOrSpecial(const char* src, int len) {
198  int bytes_consumed;
199  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
200                       &bytes_consumed);
201  return bytes_consumed;
202}
203
204
205
206// src points to non-letter, such as tag-opening '<'
207// Return length from here to next possible letter
208// On eos or another < before >, return 1
209// advances <tag>
210//          |    |
211// advances <tag> ... </tag>  for <script> <style>
212//          |               |
213// advances <!-- ... <tag> ... -->
214//          |                     |
215// advances <tag
216//          ||  (1)
217// advances <tag <tag2>
218//          ||  (1)
219int ScanToPossibleLetter(const char* isrc, int len) {
220  const uint8* src = reinterpret_cast<const uint8*>(isrc);
221  const uint8* srclimit = src + len;
222  const uint8* tagParseTbl = kTagParseTbl_0;
223  int e = 0;
224  while (src < srclimit) {
225    e = tagParseTbl[kCharToSub[*src++]];
226    if ((e & ~1) == 0) {
227      // We overshot by one byte
228      --src;
229      break;
230    }
231    tagParseTbl = &kTagParseTbl_0[e * 20];
232  }
233
234  if (src >= srclimit) {
235    // We fell off the end of the text.
236    // It looks like the most common case for this is a truncated file, not
237    // mismatched angle brackets. So we pretend that the last char was '>'
238    return len;
239  }
240
241  // OK to be in state 0 or state 2 at exit
242  if ((e != 0) && (e != 2)) {
243    // Error, '<' followed by '<'
244    // We want to back up to first <, then advance by one byte past it
245    int offset = src - reinterpret_cast<const uint8*>(isrc);
246    // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
247
248    // Backscan to first '<' and return enough length to just get past it
249    --offset;   // back up over the second '<', which caused us to stop
250    while ((0 < offset) && (isrc[offset] != '<')) {
251      // Find the first '<', which is unmatched
252      --offset;
253    }
254    // skip to just beyond first '<'
255    // printf("  returning %d\n", offset + 1);
256    return offset + 1;
257  }
258
259  return src - reinterpret_cast<const uint8*>(isrc);
260}
261
262
263
264ScriptScanner::ScriptScanner(const char* buffer,
265                             int buffer_length,
266                             bool is_plain_text)
267  : start_byte_(buffer),
268  next_byte_(buffer),
269  next_byte_limit_(buffer + buffer_length),
270  byte_length_(buffer_length),
271  is_plain_text_(is_plain_text) {
272    script_buffer_ = new char[getone::kMaxScriptBuffer];
273    script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
274}
275
276ScriptScanner::~ScriptScanner() {
277  delete[] script_buffer_;
278  delete[] script_buffer_lower_;
279}
280
281
282
283
284// Get to the first real non-tag letter or entity that is a letter
285// Sets script of that letter
286// Return len if no more letters
287int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
288  int sc = UNKNOWN_LSCRIPT;
289  int skip = 0;
290  int tlen, plen;
291
292  // Do run of non-letters (tag | &NL | NL)*
293  while (skip < len) {
294    // Do fast scan to next interesting byte
295    // int oldskip = skip;
296    skip += ScanToLetterOrSpecial(src + skip, len - skip);
297    // TEMP
298    // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
299    //       oldskip, src[oldskip], skip, src[skip]);
300
301    // Check for no more letters/specials
302    if (skip >= len) {
303      // All done
304      return len;
305    }
306
307    // We are at a letter, nonletter, tag, or entity
308    if (IsSpecial(src[skip]) && !is_plain_text_) {
309      if (src[skip] == '<') {
310        // Begining of tag; skip to end and go around again
311        tlen = ScanToPossibleLetter(src + skip, len - skip);
312        sc = 0;
313        // printf("<...> ");
314      } else if (src[skip] == '>') {
315        // Unexpected end of tag; skip it and go around again
316        tlen = 1;         // Over the >
317        sc = 0;
318        // printf("..> ");
319      } else if (src[skip] == '&') {
320        // Expand entity, no advance
321        char temp[4];
322        EntityToBuffer(src + skip, len - skip,
323                       temp, &tlen, &plen);
324        sc = getone::GetUTF8LetterScriptNum(temp);
325        // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
326      }
327    } else {
328      // Update 1..4 bytes
329      tlen = cld_UniLib::OneCharLen(src + skip);
330      sc = getone::GetUTF8LetterScriptNum(src + skip);
331      // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
332    }
333    // TEMP
334    // printf("sc=%d ", sc);
335    if (sc != 0) {break;}           // Letter found
336    skip += tlen;                   // Advance
337  }
338
339  *script = sc;
340  return skip;
341}
342
// Compile-time switch: true when the build defines NEED_ALIGNED_LOADS.
// GetOneScriptSpan checks it before taking its 4-byte fast copy path.
#ifdef NEED_ALIGNED_LOADS
static const bool kNeedsAlignedLoads = true;
#else
static const bool kNeedsAlignedLoads = false;
#endif
348
349
350// Copy next run of same-script non-tag letters to buffer [NUL terminated]
351// Buffer has leading space and all text is lowercased
352bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
353  span->text = script_buffer_;
354  span->text_bytes = 0;
355  span->offset = next_byte_ - start_byte_;
356  span->script = UNKNOWN_LSCRIPT;
357  span->lang = UNKNOWN_LANGUAGE;
358  span->truncated = false;
359
360  // printf("GetOneScriptSpan[[ ");
361  // struct timeval script_start, script_mid, script_end;
362
363  int spanscript;           // The script of this span
364  int sc = UNKNOWN_LSCRIPT;  // The script of next character
365  int tlen, plen;
366
367
368  script_buffer_[0] = ' ';  // Always a space at front of output
369  script_buffer_[1] = '\0';
370  int take = 0;
371  int put = 1;              // Start after the initial space
372
373  // gettimeofday(&script_start, NULL);
374  // Get to the first real non-tag letter or entity that is a letter
375  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
376  next_byte_ += skip;
377  byte_length_ -= skip;
378  if (byte_length_ <= 0) {
379    // printf("]]\n");
380    return false;               // No more letters to be found
381  }
382
383  // gettimeofday(&script_mid, NULL);
384
385  // There is at least one letter, so we know the script for this span
386  // printf("{%d} ", spanscript);
387  span->script = (UnicodeLScript)spanscript;
388
389
390  // Go over alternating spans of same-script letters and non-letters,
391  // copying letters to buffer with single spaces for each run of non-letters
392  while (take < byte_length_) {
393    // Copy run of letters in same script (&LS | LS)*
394    int letter_count = 0;              // Keep track of word length
395    bool need_break = false;
396    while (take < byte_length_) {
397      // We are at a letter, nonletter, tag, or entity
398      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
399        // printf("\"%c\" ", next_byte_[take]);
400        if (next_byte_[take] == '<') {
401          // Begining of tag
402          sc = 0;
403          break;
404        } else if (next_byte_[take] == '>') {
405          // Unexpected end of tag
406          sc = 0;
407          break;
408        } else if (next_byte_[take] == '&') {
409          // Copy entity, no advance
410          EntityToBuffer(next_byte_ + take, byte_length_ - take,
411                         script_buffer_ + put, &tlen, &plen);
412          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
413        }
414      } else {
415        // Real letter, safely copy up to 4 bytes, increment by 1..4
416        // Will update by 1..4 bytes at Advance, below
417        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
418        if (!kNeedsAlignedLoads && (take < (byte_length_ - 3))) {
419          // Fast case
420          *reinterpret_cast<uint32*>(script_buffer_ + put) =
421            *reinterpret_cast<const uint32*>(next_byte_ + take);
422        } else {
423          // Slow case, happens 1-3 times per input document
424          memcpy(script_buffer_ + put, next_byte_ + take, plen);
425        }
426        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
427      }
428      // printf("sc(%c)=%d ", next_byte_[take], sc);
429      // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
430      // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
431
432      // Allow continue across a single letter in a different script:
433      // A B D = three scripts, c = common script, i = inherited script,
434      // - = don't care, ( = take position before the += below
435      //  AAA(A-    continue
436      //
437      //  AAA(BA    continue
438      //  AAA(BB    break
439      //  AAA(Bc    continue (breaks after B)
440      //  AAA(BD    break
441      //  AAA(Bi    break
442      //
443      //  AAA(c-    break
444      //
445      //  AAA(i-    continue
446      //
447
448      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
449        // Might need to break this script span
450        if (sc == ULScript_Common) {
451          need_break = true;
452        } else {
453          // Look at next following character, ignoring entity as Common
454          int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
455          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
456            need_break = true;
457          }
458        }
459      }
460      if (need_break) {break;}  // Non-letter or letter in wrong script
461
462      take += tlen;                   // Advance
463      put += plen;                    // Advance
464      ++letter_count;
465      if (put >= getone::kMaxScriptBytes) {
466        // Buffer is full
467        span->truncated = true;
468        break;
469      }
470    }     // End while letters
471
472    // Do run of non-letters (tag | &NL | NL)*
473    while (take < byte_length_) {
474      // Do fast scan to next interesting byte
475      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
476
477      // Check for no more letters/specials
478      if (take >= byte_length_) {
479        take = byte_length_;
480        break;
481      }
482
483      // We are at a letter, nonletter, tag, or entity
484      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
485        // printf("\"%c\" ", next_byte_[take]);
486        if (next_byte_[take] == '<') {
487          // Begining of tag; skip to end and go around again
488          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
489          sc = 0;
490          // printf("<...> ");
491        } else if (next_byte_[take] == '>') {
492          // Unexpected end of tag; skip it and go around again
493          tlen = 1;         // Over the >
494          sc = 0;
495          // printf("..> ");
496        } else if (next_byte_[take] == '&') {
497          // Expand entity, no advance
498          EntityToBuffer(next_byte_ + take, byte_length_ - take,
499                         script_buffer_ + put, &tlen, &plen);
500          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
501        }
502      } else {
503        // Update 1..4
504        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
505        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
506      }
507      // printf("sc[%c]=%d ", next_byte_[take], sc);
508      if (sc != 0) {break;}           // Letter found
509      take += tlen;                   // Advance
510    }     // End while not-letters
511
512    script_buffer_[put++] = ' ';
513
514    // We are at a letter again (or eos), after letter* not-letter*
515    if (sc != spanscript) {break;}            // Letter in wrong script
516    if (put >= getone::kMaxScriptBytes - 8) {
517      // Buffer is almost full
518      span->truncated = true;
519      break;
520    }
521  }
522
523  // Update input position
524  next_byte_ += take;
525  byte_length_ -= take;
526
527  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
528  //                          kMaxScriptBytes |   | put
529  script_buffer_[put + 0] = ' ';
530  script_buffer_[put + 1] = ' ';
531  script_buffer_[put + 2] = ' ';
532  script_buffer_[put + 3] = '\0';
533
534  span->text_bytes = put;       // Does not include the last four chars above
535
536  // printf(" %d]]\n\n", put);
537  return true;
538}
539
540// Force Latin, Cyrillic, Greek scripts to be lowercase
541void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
542  // On Windows, text is lowercased beforehand, so no need to do anything here.
543#if !defined(CLD_WINDOWS)
544  // If needed, lowercase all the text. If we do it sooner, might miss
545  // lowercasing an entity such as &Aacute;
546  // We only need to do this for Latn and Cyrl scripts
547  if ((span->script == ULScript_Latin) ||
548      (span->script == ULScript_Cyrillic) ||
549      (span->script == ULScript_Greek)) {
550    // Full Unicode lowercase of the entire buffer, including
551    // four pad bytes off the end
552    int consumed, filled;
553    UniLib::ToLower(span->text, span->text_bytes + 4,
554                    script_buffer_lower_, getone::kMaxScriptLowerBuffer,
555                    &consumed, &filled);
556    span->text = script_buffer_lower_;
557    span->text_bytes = filled - 4;
558  }
559#endif
560}
561
562// Copy next run of same-script non-tag letters to buffer [NUL terminated]
563// Force Latin and Cyrillic scripts to be lowercase
564bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
565  bool ok = GetOneScriptSpan(span);
566  LowerScriptSpan(span);
567  return ok;
568}
569
570// Gets lscript number for letters; always returns
571//   0 (common script) for non-letters
572int getone::GetUTF8LetterScriptNum(const char* src) {
573  int srclen = cld_UniLib::OneCharLen(src);
574  const uint8* usrc = reinterpret_cast<const uint8*>(src);
575  return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
576}
577