1// Copyright (c) 2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <string>
6#include "encodings/compact_lang_det/cldutil.h"
7
8#include "base/basictypes.h"
9#include "encodings/compact_lang_det/cldutil_dbg.h"
10#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
11#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
12#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
13#include "encodings/compact_lang_det/win/cld_logging.h"
14#include "encodings/compact_lang_det/win/cld_unilib.h"
15#include "encodings/compact_lang_det/win/cld_utf.h"
16#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
17
18// Runtime routines for hashing, looking up, and scoring
19// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
20// Unigrams and bigrams are for CJK languages only, including simplified/
21// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
22// Zhuang Han characters. Surrounding spaces are not considered.
23// Quadgrams and octagrams for for non-CJK and include two bits indicating
24// preceding and trailing spaces (word boundaries).
25
26
27// Indicator bits for leading/trailing space around quad/octagram
28// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
29// 1-, 2-, or 3-bytes each.
30static const uint32 kPreSpaceIndicator =  0x00004444;
31static const uint32 kPostSpaceIndicator = 0x44440000;
32
33// Little-endian masks for 0..24 bytes picked up as uint32's
34static const uint32 kWordMask0[4] = {
35  0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
36};
37
38static const int kMinCJKUTF8CharBytes = 3;
39
40static const int kMinGramCount = 3;
41static const int kMaxGramCount = 16;
42
43
44
45
46// Routines to access a hash table of <key:wordhash, value:probs> pairs
47// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
48// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
49// bucket subscript.
50// Probs is a packed: three languages plus a subscript for probability table
51// Buckets have all the keys together, then all the values.Key array never
52// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
53// Match case may sometimes take an additional cache miss on value access.
54//
55// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
56// byte buckets with single cache miss.
57// Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
58//------------------------------------------------------------------------------
59
60
61//------------------------------------------------------------------------------
62// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
63//------------------------------------------------------------------------------
64
65// Design principles for these hash functions
66// - Few operations
67// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
68//   Latin script expect 1- and 2-byte mixtures.
69// - Last byte of each character has about 5 bits of information
70// - Spread good bits around so they can interact in at least two ways
71//   with other characters
72// - Use add for additional mixing thorugh carries
73
74// CJK Three-byte bigram
75//   ....dddd..cccccc..bbbbbb....aaaa
76//   ..................ffffff..eeeeee
77// make
78//   ....dddd..cccccc..bbbbbb....aaaa
79//   000....dddd..cccccc..bbbbbb....a
80//   ..................ffffff..eeeeee
81//   ffffff..eeeeee000000000000000000
82//
83// CJK Four-byte bigram
84//   ..dddddd..cccccc....bbbb....aaaa
85//   ..hhhhhh..gggggg....ffff....eeee
86// make
87//   ..dddddd..cccccc....bbbb....aaaa
88//   000..dddddd..cccccc....bbbb....a
89//   ..hhhhhh..gggggg....ffff....eeee
90//   ..ffff....eeee000000000000000000
91
92// BIGRAM
93// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
94// OVERSHOOTS up to 3 bytes
95// For runtime use of tables
96uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
97  if (bytecount == 0) {
98    return 0;
99  }
100  uint32 word0, word1;
101  if (bytecount <= 4) {
102    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
103    word0 = word0 ^ (word0 >> 3);
104    return word0;
105  }
106  // Else do 8 bytes
107  word0 = UnalignedLoad32(word_ptr);
108  word0 = word0 ^ (word0 >> 3);
109  word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
110  word1 = word1 ^ (word1 << 18);
111  return word0 + word1;
112}
113
114//
115// Ascii-7 One-byte chars
116//   ...ddddd...ccccc...bbbbb...aaaaa
117// make
118//   ...ddddd...ccccc...bbbbb...aaaaa
119//   000...ddddd...ccccc...bbbbb...aa
120//
121// Latin 1- and 2-byte chars
122//   ...ddddd...ccccc...bbbbb...aaaaa
123//   ...................fffff...eeeee
124// make
125//   ...ddddd...ccccc...bbbbb...aaaaa
126//   000...ddddd...ccccc...bbbbb...aa
127//   ...................fffff...eeeee
128//   ...............fffff...eeeee0000
129//
130// Non-CJK Two-byte chars
131//   ...ddddd...........bbbbb........
132//   ...hhhhh...........fffff........
133// make
134//   ...ddddd...........bbbbb........
135//   000...ddddd...........bbbbb.....
136//   ...hhhhh...........fffff........
137//   hhhh...........fffff........0000
138//
139// Non-CJK Three-byte chars
140//   ...........ccccc................
141//   ...................fffff........
142//   ...lllll...................iiiii
143// make
144//   ...........ccccc................
145//   000...........ccccc.............
146//   ...................fffff........
147//   ...............fffff........0000
148//   ...lllll...................iiiii
149//   .lllll...................iiiii00
150//
151
152// QUADGRAM
153// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
154// OVERSHOOTS up to 3 bytes
155// For runtime use of tables
156uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
157  uint32 word0, word1, word2;
158  if (bytecount <= 4) {
159    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
160    word0 = word0 ^ (word0 >> 3);
161    return word0 ^ prepost;
162  } else if (bytecount <= 8) {
163    word0 = UnalignedLoad32(word_ptr);
164    word0 = word0 ^ (word0 >> 3);
165    word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
166    word1 = word1 ^ (word1 << 4);
167    return (word0 ^ prepost) + word1;
168  }
169  // else do 12 bytes
170  word0 = UnalignedLoad32(word_ptr);
171  word0 = word0 ^ (word0 >> 3);
172  word1 = UnalignedLoad32(word_ptr + 4);
173  word1 = word1 ^ (word1 << 4);
174  word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
175  word2 = word2 ^ (word2 << 2);
176  return (word0 ^ prepost) + word1 + word2;
177}
178
179
180// QUADGRAM wrapper with surrounding spaces
181// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
182// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
183// For runtime use of tables
184uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
185  if (bytecount == 0) {
186    return 0;
187  }
188  uint32 prepost = 0;
189  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
190  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
191  return QuadHashV25Mix(word_ptr, bytecount, prepost);
192}
193
194// QUADGRAM wrapper with surrounding underscores (offline use)
195// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
196// OVERSHOOTS up to 3 bytes
197// For offline construction of tables
198uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
199  if (bytecount == 0) {
200    return 0;
201  }
202  const char* local_word_ptr = word_ptr;
203  int local_bytecount = bytecount;
204  uint32 prepost = 0;
205  if (local_word_ptr[0] == '_') {
206    prepost |= kPreSpaceIndicator;
207    ++local_word_ptr;
208    --local_bytecount;
209  }
210  if (local_word_ptr[local_bytecount - 1] == '_') {
211    prepost |= kPostSpaceIndicator;
212    --local_bytecount;
213  }
214  return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
215}
216
217
218// OCTAGRAM
219// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
220// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
221//
222// The low 32 bits follow the pattern from above, tuned to different scripts
223// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
224// For runtime use of tables V3
225uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
226  uint64 word0;
227  uint64 word1;
228  uint64 sum;
229
230  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
231  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
232  switch ((bytecount - 1) >> 2) {
233  case 0:       // 1..4 bytes
234    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
235    sum = word0;
236    word0 = word0 ^ (word0 >> 3);
237    break;
238  case 1:       // 5..8 bytes
239    word0 = UnalignedLoad32(word_ptr);
240    sum = word0;
241    word0 = word0 ^ (word0 >> 3);
242    word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
243    sum += word1;
244    word1 = word1 ^ (word1 << 4);
245    word0 += word1;
246    break;
247  case 2:       // 9..12 bytes
248    word0 = UnalignedLoad32(word_ptr);
249    sum = word0;
250    word0 = word0 ^ (word0 >> 3);
251    word1 = UnalignedLoad32(word_ptr + 4);
252    sum += word1;
253    word1 = word1 ^ (word1 << 4);
254    word0 += word1;
255    word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
256    sum += word1;
257    word1 = word1 ^ (word1 << 2);
258    word0 += word1;
259    break;
260  case 3:       // 13..16 bytes
261    word0 = UnalignedLoad32(word_ptr);
262    sum = word0;
263    word0 = word0 ^ (word0 >> 3);
264    word1 = UnalignedLoad32(word_ptr + 4);
265    sum += word1;
266    word1 = word1 ^ (word1 << 4);
267    word0 += word1;
268    word1 = UnalignedLoad32(word_ptr + 8);
269    sum += word1;
270    word1 = word1 ^ (word1 << 2);
271    word0 += word1;
272    word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3];
273    sum += word1;
274    word1 = word1 ^ (word1 >> 8);
275    word0 += word1;
276    break;
277  case 4:       // 17..20 bytes
278    word0 = UnalignedLoad32(word_ptr);
279    sum = word0;
280    word0 = word0 ^ (word0 >> 3);
281    word1 = UnalignedLoad32(word_ptr + 4);
282    sum += word1;
283    word1 = word1 ^ (word1 << 4);
284    word0 += word1;
285    word1 = UnalignedLoad32(word_ptr + 8);
286    sum += word1;
287    word1 = word1 ^ (word1 << 2);
288    word0 += word1;
289    word1 = UnalignedLoad32(word_ptr + 12);
290    sum += word1;
291    word1 = word1 ^ (word1 >> 8);
292    word0 += word1;
293    word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3];
294    sum += word1;
295    word1 = word1 ^ (word1 >> 4);
296    word0 += word1;
297    break;
298  default:      // 21..24 bytes and higher (ignores beyond 24)
299    word0 = UnalignedLoad32(&word_ptr);
300    sum = word0;
301    word0 = word0 ^ (word0 >> 3);
302    word1 = UnalignedLoad32(word_ptr + 4);
303    sum += word1;
304    word1 = word1 ^ (word1 << 4);
305    word0 += word1;
306    word1 = UnalignedLoad32(word_ptr + 8);
307    sum += word1;
308    word1 = word1 ^ (word1 << 2);
309    word0 += word1;
310    word1 = UnalignedLoad32(word_ptr + 12);
311    sum += word1;
312    word1 = word1 ^ (word1 >> 8);
313    word0 += word1;
314    word1 = UnalignedLoad32(word_ptr + 16);
315    sum += word1;
316    word1 = word1 ^ (word1 >> 4);
317    word0 += word1;
318    word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3];
319    sum += word1;
320    word1 = word1 ^ (word1 >> 6);
321    word0 += word1;
322    break;
323  }
324
325  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
326  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
327  sum = (sum & 0xff) << 32;
328  return (word0 ^ prepost) + sum;
329}
330
331// OCTAGRAM wrapper with surrounding spaces
332// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
333// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
334//
335// The low 32 bits follow the pattern from above, tuned to different scripts
336// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
337// For runtime use of tables V3
338uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
339  if (bytecount == 0) {
340    return 0;
341  }
342  uint64 prepost = 0;
343  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
344  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
345  return OctaHash40Mix(word_ptr, bytecount, prepost);
346}
347
348
349// OCTAGRAM wrapper with surrounding underscores (offline use)
350// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
351// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
352//
353// The low 32 bits follow the pattern from above, tuned to different scripts
354// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
355// For offline construction of tables
356uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
357  if (bytecount == 0) {
358    return 0;
359  }
360  const char* local_word_ptr = word_ptr;
361  int local_bytecount = bytecount;
362  uint64 prepost = 0;
363  if (local_word_ptr[0] == '_') {
364    prepost |= kPreSpaceIndicator;
365    ++local_word_ptr;
366    --local_bytecount;
367  }
368  if (local_word_ptr[local_bytecount - 1] == '_') {
369    prepost |= kPostSpaceIndicator;
370    --local_bytecount;
371  }
372  return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
373}
374
375
376
377
378//------------------------------------------------------------------------------
379// Scoring single groups of letters
380//------------------------------------------------------------------------------
381
382// UNIGRAM score one => tote
383// Input: 1-byte entry of subscript into unigram probs, plus
384//  an accumulator tote.
385// Output: running sums in tote updated
386void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
387  tote->AddGram();
388  const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
389  if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
390  if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
391  if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
392  if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
393  if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
394  if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
395}
396
397// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
398// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
399//  an accumulator tote. (language 0 means unused entry)
400// Output: running sums in tote updated
401void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
402  tote->AddGram();
403  uint8 prob123 = (probs >> 0) & 0xff;
404  const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
405
406  uint8 top1 = (probs >> 8) & 0xff;
407  if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
408  uint8 top2 = (probs >> 16) & 0xff;
409  if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
410  uint8 top3 = (probs >> 24) & 0xff;
411  if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
412}
413
414
415//------------------------------------------------------------------------------
416// Routines to accumulate probabilities
417//------------------------------------------------------------------------------
418
419
420// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
421// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
422// Score up to n unigrams, returning number of bytes consumed
423// Updates tote_grams
424int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
425                      const char* isrc, int srclen, int advance_by,
426                      int* tote_grams, int gram_limit, Tote* chunk_tote) {
427  const char* src = isrc;
428  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
429
430  // Property-based CJK unigram lookup
431  if (src[0] == ' ') {++src; --srclen;}
432
433  const uint8* usrc = reinterpret_cast<const uint8*>(src);
434  int usrclen = srclen;
435
436  while (usrclen > 0) {
437    int len = kAdvanceOneChar[usrc[0]];
438    // Look up property of one UTF-8 character and advance over it
439    // Return 0 if input length is zero
440    // Return 0 and advance one byte if input is ill-formed
441
442    int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
443
444    if (FLAGS_dbglookup) {
445      DbgUniTermToStderr(propval, usrc, len);
446    }
447
448    if (propval > 0) {
449      ProcessProbV25UniTote(propval, chunk_tote);
450      ++(*tote_grams);
451      if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
452    }
453
454    // Advance by 1/2/4/8 characters (half of quad advance)
455    if (advance_by == 2) {
456      // Already advanced by 1
457    } else if (advance_by == 4) {
458      // Advance by 2 chars total, if not at end
459      if (UTFmax <= usrclen) {
460        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
461      }
462    } else if (advance_by == 8) {
463      // Advance by 4 chars total, if not at end
464      if ((UTFmax * 3) <= usrclen) {
465        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
466        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
467        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
468      }
469    } else {
470      // Advance by 8 chars total, if not at end
471      if ((UTFmax * 7) <= usrclen) {
472        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
473        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
474        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
475        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
476        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
477        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
478        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
479      }
480    }
481    DCHECK(usrclen >= 0);
482
483    if (*tote_grams >= gram_limit) {
484      break;
485    }
486  }
487  if (FLAGS_dbgscore) {
488    // With advance_by>2, we consume more input to get the same number of quads
489    int len = src - isrc;
490    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
491    DbgScoreFlush();
492  }
493
494  int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
495  return consumed2;
496}
497
498
499// BIGRAM, using hash table, always advancing by 1 char
500// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
501// Score all bigrams in isrc, using languages that have bigrams (CJK)
502// Return number of bigrams that hit in the hash table
503int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
504                         const char* isrc, int srclen, Tote* chunk_tote) {
505  int hit_count = 0;
506  const char* src = isrc;
507
508  // Hashtable-based CJK bigram lookup
509  const uint8* usrc = reinterpret_cast<const uint8*>(src);
510  const uint8* usrclimit1 = usrc + srclen - UTFmax;
511  if (FLAGS_dbgscore) {
512    fprintf(stderr, "  " );
513  }
514
515  while (usrc < usrclimit1) {
516    int len = kAdvanceOneChar[usrc[0]];
517    int len2 = kAdvanceOneChar[usrc[len]] + len;
518
519    if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
520      // Lookup and score this bigram
521      // Always ignore pre/post spaces
522      uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
523      uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
524      // Now go indirect on the subscript
525      probs = bigram_obj->kCLDTableInd[probs &
526        ~bigram_obj->kCLDTableKeyMask];
527
528      // Process the bigram
529      if (FLAGS_dbglookup) {
530        const char* ssrc = reinterpret_cast<const char*>(usrc);
531        DbgBiTermToStderr(bihash, probs, ssrc, len2);
532        DbgScoreRecord(NULL, probs, len2);
533      } else if (FLAGS_dbgscore && (probs != 0)) {
534        const char* ssrc = reinterpret_cast<const char*>(usrc);
535        DbgScoreRecord(NULL, probs, len2);
536        string temp(ssrc, len2);
537        fprintf(stderr, "%s ", temp.c_str());
538      }
539
540      if (probs != 0) {
541        ProcessProbV25Tote(probs, chunk_tote);
542        ++hit_count;
543      }
544    }
545    usrc += len;  // Advance by one char
546  }
547
548  if (FLAGS_dbgscore) {
549    fprintf(stderr, "[%d bigrams scored]\n", hit_count);
550    DbgScoreState();
551  }
552  return hit_count;
553}
554
555
556
557// QUADGRAM, using hash table, advancing by 2/4/8/16 chars
558// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
559// Score up to n quadgrams, returning number of bytes consumed
560// Updates tote_grams
561int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
562                       const char* isrc, int srclen, int advance_by,
563                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
564  const char* src = isrc;
565  const char* srclimit = src + srclen;
566  // Limit is end, which has extra 20 20 20 00 past len
567  const char* srclimit7 = src + srclen - (UTFmax * 7);
568  const char* srclimit15 = src + srclen - (UTFmax * 15);
569
570  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
571
572  // Run a little cache of last hits to catch overly-repetitive "text"
573  int next_prior = 0;
574  uint32 prior_quads[2] = {0, 0};
575
576  // Visit all quadgrams
577  if (src[0] == ' ') {++src;}
578  while (src < srclimit) {
579    // Find one quadgram
580    const char* src_end = src;
581    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
582    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
583    const char* src_mid = src_end;
584    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
585    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
586    int len = src_end - src;
587
588    // Lookup and score this quadgram
589    uint32 quadhash = QuadHashV25(src, len);
590    uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
591    // Now go indirect on the subscript
592    probs = quadgram_obj->kCLDTableInd[probs &
593      ~quadgram_obj->kCLDTableKeyMask];
594
595    // Process the quadgram
596    if (FLAGS_dbglookup) {
597      DbgQuadTermToStderr(quadhash, probs, src, len);
598    }
599    if (probs != 0) {
600      // Filter out recent repeats. If this works out, use in the other lookups
601      if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
602        prior_quads[next_prior] = quadhash;
603        next_prior = (next_prior + 1) & 1;
604        ProcessProbV25Tote(probs, chunk_tote);
605        ++(*tote_grams);
606        if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
607      }
608    }
609
610    // Advance all the way past word if at end-of-word
611    if (src_end[0] == ' ') {
612      src_mid = src_end;
613    }
614
615    // Advance by 2/4/8/16 characters
616    if (advance_by == 2) {
617      src = src_mid;
618    } else if (advance_by == 4) {
619      src = src_end;
620    } else if (advance_by == 8) {
621      // Advance by 8 chars total (4 more), if not at end
622      if (src < srclimit7) {
623        src_end += kAdvanceOneChar[(uint8)src_end[0]];
624        src_end += kAdvanceOneChar[(uint8)src_end[0]];
625        src_end += kAdvanceOneChar[(uint8)src_end[0]];
626        src_end += kAdvanceOneChar[(uint8)src_end[0]];
627      }
628      src = src_end;
629    } else {
630      // Advance by 16 chars total (12 more), if not at end
631      if (src < srclimit15) {
632        // Advance by ~16 chars by adding 3 * current bytelen
633        int fourcharlen = src_end - src;
634        src = src_end + (3 * fourcharlen);
635        // Advance a bit more if mid-character
636        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
637        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
638      } else {
639        src = src_end;
640      }
641    }
642    DCHECK(src < srclimit);
643    src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
644
645    if (*tote_grams >= gram_limit) {
646      break;
647    }
648  }
649
650  if (FLAGS_dbgscore) {
651    // With advance_by>2, we consume more input to get the same number of quads
652    int len = src - isrc;
653    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
654    DbgScoreFlush();
655  }
656
657  int consumed = src - isrc;
658
659  // If advancing by more than 2, src may have overshot srclimit
660  if (consumed > srclen) {
661    consumed = srclen;
662  }
663
664  return consumed;
665}
666
667
668// OCTAGRAM, using hash table, always advancing by 1 word
669// Caller supplies table, such as &kLongWord8Table_obj
670// Score all words in isrc, using languages that have quadgrams
671// We don't normally use this routine except on the first quadgram run,
672// but it can be used to resolve unreliable pages.
673// This routine does not have an optimized advance_by
674// SOON: Uses indirect language/probability longword
675//
676// Return number of words that hit in the hash table
677int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
678                       const char* isrc, int srclen, Tote* chunk_tote) {
679  int hit_count = 0;
680  const char* src = isrc;
681  const char* srclimit = src + srclen + 1;
682  // Limit is end+1, to include extra space char (0x20) off the end
683  //
684  // Score all words truncated to 8 characters
685  int charcount = 0;
686  // Skip any initial space
687  if (src[0] == ' ') {++src;}
688  const char* word_ptr = src;
689  const char* word_end = word_ptr;
690  if (FLAGS_dbgscore) {
691    fprintf(stderr, "  " );
692  }
693  while (src < srclimit) {
694    // Terminate previous word or continue current word
695    if (src[0] == ' ') {
696      int bytecount = word_end - word_ptr;
697      if (bytecount == 0)
698        break;
699      // Lookup and score this word
700      uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
701      uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
702      // Now go indirect on the subscript
703      probs = octagram_obj->kCLDTableInd[probs &
704        ~octagram_obj->kCLDTableKeyMask];
705
706      // // Lookup and score this word
707      // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
708      // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
709      //                                kLongWord8TableSize);
710      //
711      if (FLAGS_dbglookup) {
712        DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
713        DbgScoreRecord(NULL, probs, bytecount);
714      } else if (FLAGS_dbgscore && (probs != 0)) {
715        DbgScoreRecord(NULL, probs, bytecount);
716        string temp(word_ptr, bytecount);
717        fprintf(stderr, "%s ", temp.c_str());
718      }
719
720      if (probs != 0) {
721        ProcessProbV25Tote(probs, chunk_tote);
722        ++hit_count;
723      }
724      charcount = 0;
725      word_ptr = src + 1;   // Over the space
726      word_end = word_ptr;
727    } else {
728      ++charcount;
729    }
730
731    // Advance to next char
732    src += cld_UniLib::OneCharLen(src);
733    if (charcount <= 8) {
734      word_end = src;
735    }
736  }
737
738  if (FLAGS_dbgscore) {
739    fprintf(stderr, "[%d words scored]\n", hit_count);
740    DbgScoreState();
741  }
742  return hit_count;
743}
744
745
746
747//------------------------------------------------------------------------------
748// Reliability calculations, for single language and between languages
749//------------------------------------------------------------------------------
750
751// Return reliablity of result 0..100 for top two scores
752// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
753// (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
754// Threshold is uni/quadgram increment count, bounded above and below.
755//
756// Requiring a factor of 3 improvement (e.g. +1 log base 3)
757// for each scored quadgram is too stringent, so I've backed this off to a
758// factor of 2 (e.g. +5/8 log base 3).
759//
760// I also somewhat lowered the Min/MaxGramCount limits above
761//
762// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
763//
764int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
765  int max_reliability_percent = 100;
766  if (gramcount < 8) {
767    max_reliability_percent = 12 * gramcount;
768  }
769  int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
770  if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
771    fully_reliable_thresh = kMinGramCount;
772  } else if (fully_reliable_thresh > kMaxGramCount) {
773    fully_reliable_thresh = kMaxGramCount;
774  }
775
776  int delta = value1 - value2;
777  if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
778  if (delta <= 0) {return 0;}
779  return cld::minint(max_reliability_percent,
780                     (100 * delta) / fully_reliable_thresh);
781}
782
783// Return reliablity of result 0..100 for top score vs. mainsteam score
784// Values are score per 1024 bytes of input
785// ratio = max(top/mainstream, mainstream/top)
786// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
787// Change: short-text word scoring can give unusually good results.
788//  Let top exceed mainstream by 4x at 50% reliable
789int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
790  if (mean_score == 0) {return 100;}    // No reliability data available yet
791  if (topscore == 0) {return 0;}        // zero score = unreliable
792  if (len == 0) {return 0;}             // zero len = unreliable
793  int top_kb = (topscore << 10) / len;
794  double ratio;
795  double ratio_cutoff;
796  if (top_kb > mean_score) {
797    ratio = (1.0 * top_kb) / mean_score;
798    ratio_cutoff = 5.0;                 // ramp down from 100% to 0%: 3.0-5.0
799  } else {
800    ratio = (1.0 * mean_score) / top_kb;
801    ratio_cutoff = 4.0;                 // ramp down from 100% to 0%: 2.0-4.0
802  }
803  if (ratio <= ratio_cutoff - 2.0) {return 100;}
804  if (ratio > ratio_cutoff) {return 0;}
805
806  int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
807  return iratio;
808}
809
810// Calculate ratio of score per 1KB vs. expected score per 1KB
811double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
812                          int bytes, int score) {
813  // Average training-data score for this language-script combo, per 1KB
814  int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
815  if (lscript == ULScript_Common) {
816    // We don't know the script (only happens with second-chance score)
817    // Look for first non-zero mean value
818    for (int i = 2; i >= 0; --i) {
819      if (kMeanScore[lang * 4 + i] > 0) {
820        expected_score = kMeanScore[lang * 4 + i];
821        break;
822      }
823    }
824  }
825  if (expected_score < 100) {
826      expected_score = 1000;
827  }
828
829  // Our score per 1KB
830  double our_score = (score << 10) / (bytes ? bytes : 1);  // Avoid zdiv
831  double ratio = our_score / expected_score;
832
833  // Just the raw count normalized as though each language has mean=1000;
834  ratio = (score * 1000.0) /  expected_score;
835  return ratio;
836}
837
838// Calculate reliablity of len bytes of script lscript with chunk_tote
839int cld::GetReliability(int len, UnicodeLScript lscript,
840                   const Tote* chunk_tote) {
841  Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
842  // Average score for this language-script combo
843  int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
844  if (lscript == ULScript_Common) {
845    // We don't know the script (only happens with second-chance score)
846    // Look for first non-zero mean value
847    for (int i = 2; i >= 0; --i) {
848      if (kMeanScore[cur_lang * 4 + i] > 0) {
849        mean_score = kMeanScore[cur_lang * 4 + i];
850        break;
851      }
852    }
853  }
854  int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
855                                           chunk_tote->Value(1),
856                                           chunk_tote->GetGramCount());
857
858  int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
859                                               len,
860                                               mean_score);
861
862  int reliability_min = minint(reliability_delta, reliability_main);
863
864
865  if (FLAGS_dbgreli) {
866    char temp1[4];
867    char temp2[4];
868    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
869    if (temp1[2] == ' ') {temp1[2] = '\0';}
870    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
871    if (temp2[2] == ' ') {temp2[2] = '\0';}
872    int srclen = len;
873    fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d,  %s=%d %s=%d "
874                   "top/KB=%d mean/KB=%d del=%d%% reli=%d%%   "
875                   "lang/lscript %d %d\n",
876           chunk_tote->GetGramCount(),
877           chunk_tote->GetIncrCount(),
878           srclen,
879           temp1, chunk_tote->Value(0),
880           temp2, chunk_tote->Value(1),
881           (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
882           mean_score,
883           reliability_delta,
884           reliability_main,
885           cur_lang, lscript);
886  }
887
888  return reliability_min;
889}
890
891
892//------------------------------------------------------------------------------
893// Miscellaneous
894//------------------------------------------------------------------------------
895
896// Demote all languages except Top40 and plus_one
897// Do this just before sorting chunk_tote results
898void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
899  for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
900    if (chunk_tote->Key(sub) == 0) continue;
901    if (chunk_tote->Key(sub) == packed_plus_one) continue;
902    if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
903    // Quarter the score of others
904    chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
905  }
906}
907