15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2009 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/cldutil.h"
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/cldutil_dbg.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_logging.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_unilib.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Runtime routines for hashing, looking up, and scoring
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Unigrams and bigrams are for CJK languages only, including simplified/
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Zhuang Han characters. Surrounding spaces are not considered.
// Quadgrams and octagrams are for non-CJK and include two bits indicating
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// preceding and trailing spaces (word boundaries).
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Indicator bits for leading/trailing space around quad/octagram
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1-, 2-, or 3-bytes each.
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kPreSpaceIndicator =  0x00004444;
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kPostSpaceIndicator = 0x44440000;
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Little-endian masks for 0..24 bytes picked up as uint32's
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kWordMask0[4] = {
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMinCJKUTF8CharBytes = 3;
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMinGramCount = 3;
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMaxGramCount = 16;
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Routines to access a hash table of <key:wordhash, value:probs> pairs
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// bucket subscript.
// Probs is packed: three languages plus a subscript for the probability table.
// Buckets have all the keys together, then all the values. Key array never
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Match case may sometimes take an additional cache miss on value access.
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
// byte buckets with single cache miss.
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Design principles for these hash functions
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Few operations
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing, except
//   that in Latin script we expect 1- and 2-byte mixtures.
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Last byte of each character has about 5 bits of information
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Spread good bits around so they can interact in at least two ways
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   with other characters
// - Use add for additional mixing through carries
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CJK Three-byte bigram
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ....dddd..cccccc..bbbbbb....aaaa
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ..................ffffff..eeeeee
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ....dddd..cccccc..bbbbbb....aaaa
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   000....dddd..cccccc..bbbbbb....a
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ..................ffffff..eeeeee
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ffffff..eeeeee000000000000000000
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CJK Four-byte bigram
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ..dddddd..cccccc....bbbb....aaaa
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ..hhhhhh..gggggg....ffff....eeee
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ..dddddd..cccccc....bbbb....aaaa
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   000..dddddd..cccccc....bbbb....a
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ..hhhhhh..gggggg....ffff....eeee
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ..ffff....eeee000000000000000000
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BIGRAM
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OVERSHOOTS up to 3 bytes
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (bytecount == 0) {
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return 0;
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 word0, word1;
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (bytecount <= 4) {
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return word0;
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Else do 8 bytes
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word0 = UnalignedLoad32(word_ptr);
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word0 = word0 ^ (word0 >> 3);
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word1 = word1 ^ (word1 << 18);
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return word0 + word1;
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Ascii-7 One-byte chars
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...ddddd...ccccc...bbbbb...aaaaa
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...ddddd...ccccc...bbbbb...aaaaa
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   000...ddddd...ccccc...bbbbb...aa
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Latin 1- and 2-byte chars
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...ddddd...ccccc...bbbbb...aaaaa
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...................fffff...eeeee
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...ddddd...ccccc...bbbbb...aaaaa
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   000...ddddd...ccccc...bbbbb...aa
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...................fffff...eeeee
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...............fffff...eeeee0000
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Non-CJK Two-byte chars
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...ddddd...........bbbbb........
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...hhhhh...........fffff........
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...ddddd...........bbbbb........
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   000...ddddd...........bbbbb.....
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...hhhhh...........fffff........
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   hhhh...........fffff........0000
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Non-CJK Three-byte chars
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...........ccccc................
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...................fffff........
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...lllll...................iiiii
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...........ccccc................
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   000...........ccccc.............
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...................fffff........
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...............fffff........0000
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   ...lllll...................iiiii
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   .lllll...................iiiii00
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OVERSHOOTS up to 3 bytes
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 word0, word1, word2;
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (bytecount <= 4) {
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return word0 ^ prepost;
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else if (bytecount <= 8) {
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr);
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 4);
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return (word0 ^ prepost) + word1;
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // else do 12 bytes
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word0 = UnalignedLoad32(word_ptr);
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word0 = word0 ^ (word0 >> 3);
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word1 = UnalignedLoad32(word_ptr + 4);
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word1 = word1 ^ (word1 << 4);
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word2 = word2 ^ (word2 << 2);
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return (word0 ^ prepost) + word1 + word2;
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM wrapper with surrounding spaces
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (bytecount == 0) {
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return 0;
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 prepost = 0;
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return QuadHashV25Mix(word_ptr, bytecount, prepost);
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM wrapper with surrounding underscores (offline use)
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OVERSHOOTS up to 3 bytes
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For offline construction of tables
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (bytecount == 0) {
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return 0;
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* local_word_ptr = word_ptr;
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int local_bytecount = bytecount;
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 prepost = 0;
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (local_word_ptr[0] == '_') {
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    prepost |= kPreSpaceIndicator;
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++local_word_ptr;
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    --local_bytecount;
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (local_word_ptr[local_bytecount - 1] == '_') {
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    prepost |= kPostSpaceIndicator;
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    --local_bytecount;
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The low 32 bits follow the pattern from above, tuned to different scripts
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables V3
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint64 word0;
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint64 word1;
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint64 sum;
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch ((bytecount - 1) >> 2) {
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  case 0:       // 1..4 bytes
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum = word0;
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    break;
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  case 1:       // 5..8 bytes
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr);
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum = word0;
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 4);
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    break;
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  case 2:       // 9..12 bytes
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr);
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum = word0;
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 4);
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 4);
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 2);
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    break;
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  case 3:       // 13..16 bytes
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr);
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum = word0;
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 4);
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 4);
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 8);
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 2);
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3];
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 >> 8);
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    break;
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  case 4:       // 17..20 bytes
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(word_ptr);
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum = word0;
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 4);
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 4);
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 8);
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 2);
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 12);
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 >> 8);
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3];
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 >> 4);
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    break;
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  default:      // 21..24 bytes and higher (ignores beyond 24)
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = UnalignedLoad32(&word_ptr);
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum = word0;
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 = word0 ^ (word0 >> 3);
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 4);
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 4);
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 8);
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 << 2);
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 12);
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 >> 8);
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 16);
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 >> 4);
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3];
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    sum += word1;
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word1 = word1 ^ (word1 >> 6);
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word0 += word1;
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    break;
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sum = (sum & 0xff) << 32;
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return (word0 ^ prepost) + sum;
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM wrapper with surrounding spaces
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The low 32 bits follow the pattern from above, tuned to different scripts
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables V3
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (bytecount == 0) {
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return 0;
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint64 prepost = 0;
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return OctaHash40Mix(word_ptr, bytecount, prepost);
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM wrapper with surrounding underscores (offline use)
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The low 32 bits follow the pattern from above, tuned to different scripts
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For offline construction of tables
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (bytecount == 0) {
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return 0;
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* local_word_ptr = word_ptr;
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int local_bytecount = bytecount;
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint64 prepost = 0;
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (local_word_ptr[0] == '_') {
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    prepost |= kPreSpaceIndicator;
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++local_word_ptr;
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    --local_bytecount;
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (local_word_ptr[local_bytecount - 1] == '_') {
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    prepost |= kPostSpaceIndicator;
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    --local_bytecount;
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Scoring single groups of letters
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNIGRAM score one => tote
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Input: 1-byte entry of subscript into unigram probs, plus
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  an accumulator tote.
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Output: running sums in tote updated
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  tote->AddGram();
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  an accumulator tote. (language 0 means unused entry)
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Output: running sums in tote updated
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  tote->AddGram();
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8 prob123 = (probs >> 0) & 0xff;
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8 top1 = (probs >> 8) & 0xff;
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8 top2 = (probs >> 16) & 0xff;
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8 top3 = (probs >> 24) & 0xff;
4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Routines to accumulate probabilities
4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score up to n unigrams, returning number of bytes consumed
4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Updates tote_grams
4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      const char* isrc, int srclen, int advance_by,
4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      int* tote_grams, int gram_limit, Tote* chunk_tote) {
4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* src = isrc;
4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Property-based CJK unigram lookup
4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (src[0] == ' ') {++src; --srclen;}
4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* usrc = reinterpret_cast<const uint8*>(src);
4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int usrclen = srclen;
4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (usrclen > 0) {
4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int len = kAdvanceOneChar[usrc[0]];
4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Look up property of one UTF-8 character and advance over it
4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Return 0 if input length is zero
4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Return 0 and advance one byte if input is ill-formed
4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (FLAGS_dbglookup) {
4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      DbgUniTermToStderr(propval, usrc, len);
4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (propval > 0) {
4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ProcessProbV25UniTote(propval, chunk_tote);
4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ++(*tote_grams);
4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Advance by 1/2/4/8 characters (half of quad advance)
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (advance_by == 2) {
4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Already advanced by 1
4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else if (advance_by == 4) {
4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Advance by 2 chars total, if not at end
4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (UTFmax <= usrclen) {
4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else if (advance_by == 8) {
4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Advance by 4 chars total, if not at end
4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if ((UTFmax * 3) <= usrclen) {
4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else {
4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Advance by 8 chars total, if not at end
4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if ((UTFmax * 7) <= usrclen) {
4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DCHECK(usrclen >= 0);
4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (*tote_grams >= gram_limit) {
4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {
4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // With advance_by>2, we consume more input to get the same number of quads
4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int len = src - isrc;
4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DbgScoreFlush();
4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return consumed2;
4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BIGRAM, using hash table, always advancing by 1 char
5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score all bigrams in isrc, using languages that have bigrams (CJK)
5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return number of bigrams that hit in the hash table
5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         const char* isrc, int srclen, Tote* chunk_tote) {
5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int hit_count = 0;
5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* src = isrc;
5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Hashtable-based CJK bigram lookup
5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* usrc = reinterpret_cast<const uint8*>(src);
5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* usrclimit1 = usrc + srclen - UTFmax;
5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {
5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "  " );
5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (usrc < usrclimit1) {
5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int len = kAdvanceOneChar[usrc[0]];
5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int len2 = kAdvanceOneChar[usrc[len]] + len;
5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Lookup and score this bigram
5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Always ignore pre/post spaces
5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Now go indirect on the subscript
5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      probs = bigram_obj->kCLDTableInd[probs &
5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ~bigram_obj->kCLDTableKeyMask];
5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Process the bigram
5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (FLAGS_dbglookup) {
5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        const char* ssrc = reinterpret_cast<const char*>(usrc);
5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        DbgBiTermToStderr(bihash, probs, ssrc, len2);
5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        DbgScoreRecord(NULL, probs, len2);
5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else if (FLAGS_dbgscore && (probs != 0)) {
5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        const char* ssrc = reinterpret_cast<const char*>(usrc);
5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        DbgScoreRecord(NULL, probs, len2);
5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        string temp(ssrc, len2);
5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        fprintf(stderr, "%s ", temp.c_str());
5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (probs != 0) {
5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ProcessProbV25Tote(probs, chunk_tote);
5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ++hit_count;
5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    usrc += len;  // Advance by one char
5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {
5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "[%d bigrams scored]\n", hit_count);
5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DbgScoreState();
5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return hit_count;
5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM, using hash table, advancing by 2/4/8/16 chars
5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score up to n quadgrams, returning number of bytes consumed
5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Updates tote_grams
5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       const char* isrc, int srclen, int advance_by,
5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* src = isrc;
5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* srclimit = src + srclen;
5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Limit is end, which has extra 20 20 20 00 past len
5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* srclimit7 = src + srclen - (UTFmax * 7);
5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* srclimit15 = src + srclen - (UTFmax * 15);
5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Run a little cache of last hits to catch overly-repetitive "text"
5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int next_prior = 0;
5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 prior_quads[2] = {0, 0};
5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Visit all quadgrams
5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (src[0] == ' ') {++src;}
5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (src < srclimit) {
5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Find one quadgram
5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char* src_end = src;
5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char* src_mid = src_end;
5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int len = src_end - src;
5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Lookup and score this quadgram
5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 quadhash = QuadHashV25(src, len);
5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Now go indirect on the subscript
5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    probs = quadgram_obj->kCLDTableInd[probs &
5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ~quadgram_obj->kCLDTableKeyMask];
5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Process the quadgram
5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (FLAGS_dbglookup) {
5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      DbgQuadTermToStderr(quadhash, probs, src, len);
5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (probs != 0) {
6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Filter out recent repeats. If this works out, use in the other lookups
6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        prior_quads[next_prior] = quadhash;
6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        next_prior = (next_prior + 1) & 1;
6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ProcessProbV25Tote(probs, chunk_tote);
6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ++(*tote_grams);
6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Advance all the way past word if at end-of-word
6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (src_end[0] == ' ') {
6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      src_mid = src_end;
6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Advance by 2/4/8/16 characters
6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (advance_by == 2) {
6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      src = src_mid;
6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else if (advance_by == 4) {
6195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      src = src_end;
6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else if (advance_by == 8) {
6215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Advance by 8 chars total (4 more), if not at end
6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (src < srclimit7) {
6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src_end += kAdvanceOneChar[(uint8)src_end[0]];
6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src_end += kAdvanceOneChar[(uint8)src_end[0]];
6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src_end += kAdvanceOneChar[(uint8)src_end[0]];
6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src_end += kAdvanceOneChar[(uint8)src_end[0]];
6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      src = src_end;
6295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else {
6305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Advance by 16 chars total (12 more), if not at end
6315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (src < srclimit15) {
6325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Advance by ~16 chars by adding 3 * current bytelen
6335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        int fourcharlen = src_end - src;
6345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src = src_end + (3 * fourcharlen);
6355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Advance a bit more if mid-character
6365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
6375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
6385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else {
6395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        src = src_end;
6405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
6415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
6425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DCHECK(src < srclimit);
6435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
6445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (*tote_grams >= gram_limit) {
6465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
6475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
6485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {
6515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // With advance_by>2, we consume more input to get the same number of quads
6525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int len = src - isrc;
6535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
6545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DbgScoreFlush();
6555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int consumed = src - isrc;
6585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If advancing by more than 2, src may have overshot srclimit
6605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (consumed > srclen) {
6615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    consumed = srclen;
6625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return consumed;
6655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM, using hash table, always advancing by 1 word
6695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as &kLongWord8Table_obj
6705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score all words in isrc, using languages that have quadgrams
6715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// We don't normally use this routine except on the first quadgram run,
6725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but it can be used to resolve unreliable pages.
6735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This routine does not have an optimized advance_by
6745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SOON: Uses indirect language/probability longword
6755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
6765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return number of words that hit in the hash table
6775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
6785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       const char* isrc, int srclen, Tote* chunk_tote) {
6795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int hit_count = 0;
6805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* src = isrc;
6815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* srclimit = src + srclen + 1;
6825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Limit is end+1, to include extra space char (0x20) off the end
6835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
6845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Score all words truncated to 8 characters
6855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int charcount = 0;
6865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Skip any initial space
6875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (src[0] == ' ') {++src;}
6885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* word_ptr = src;
6895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* word_end = word_ptr;
6905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {
6915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "  " );
6925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (src < srclimit) {
6945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Terminate previous word or continue current word
6955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (src[0] == ' ') {
6965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      int bytecount = word_end - word_ptr;
6975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (bytecount == 0)
6985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
6995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Lookup and score this word
7005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
7015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
7025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Now go indirect on the subscript
7035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      probs = octagram_obj->kCLDTableInd[probs &
7045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ~octagram_obj->kCLDTableKeyMask];
7055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // // Lookup and score this word
7075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
7085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
7095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //                                kLongWord8TableSize);
7105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      //
7115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (FLAGS_dbglookup) {
7125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
7135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        DbgScoreRecord(NULL, probs, bytecount);
7145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else if (FLAGS_dbgscore && (probs != 0)) {
7155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        DbgScoreRecord(NULL, probs, bytecount);
7165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        string temp(word_ptr, bytecount);
7175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        fprintf(stderr, "%s ", temp.c_str());
7185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
7195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (probs != 0) {
7215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ProcessProbV25Tote(probs, chunk_tote);
7225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ++hit_count;
7235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
7245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      charcount = 0;
7255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      word_ptr = src + 1;   // Over the space
7265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      word_end = word_ptr;
7275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else {
7285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ++charcount;
7295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Advance to next char
7325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    src += cld_UniLib::OneCharLen(src);
7335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (charcount <= 8) {
7345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      word_end = src;
7355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
7375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgscore) {
7395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "[%d words scored]\n", hit_count);
7405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DbgScoreState();
7415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
7425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return hit_count;
7435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
7445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
7485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reliability calculations, for single language and between languages
7495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
7505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return reliablity of result 0..100 for top two scores
7525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
7535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
7545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Threshold is uni/quadgram increment count, bounded above and below.
7555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
7565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Requiring a factor of 3 improvement (e.g. +1 log base 3)
7575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for each scored quadgram is too stringent, so I've backed this off to a
7585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// factor of 2 (e.g. +5/8 log base 3).
7595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
7605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// I also somewhat lowered the Min/MaxGramCount limits above
7615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
7625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
7635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
7645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
7655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int max_reliability_percent = 100;
7665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (gramcount < 8) {
7675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    max_reliability_percent = 12 * gramcount;
7685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
7695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
7705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
7715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fully_reliable_thresh = kMinGramCount;
7725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else if (fully_reliable_thresh > kMaxGramCount) {
7735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fully_reliable_thresh = kMaxGramCount;
7745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
7755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int delta = value1 - value2;
7775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
7785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (delta <= 0) {return 0;}
7795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return cld::minint(max_reliability_percent,
7805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     (100 * delta) / fully_reliable_thresh);
7815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
7825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return reliablity of result 0..100 for top score vs. mainsteam score
7845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Values are score per 1024 bytes of input
7855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ratio = max(top/mainstream, mainstream/top)
7865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
7875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Change: short-text word scoring can give unusually good results.
7885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  Let top exceed mainstream by 4x at 50% reliable
7895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
7905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (mean_score == 0) {return 100;}    // No reliability data available yet
7915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (topscore == 0) {return 0;}        // zero score = unreliable
7925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (len == 0) {return 0;}             // zero len = unreliable
7935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int top_kb = (topscore << 10) / len;
7945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  double ratio;
7955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  double ratio_cutoff;
7965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (top_kb > mean_score) {
7975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ratio = (1.0 * top_kb) / mean_score;
7985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ratio_cutoff = 5.0;                 // ramp down from 100% to 0%: 3.0-5.0
7995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
8005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ratio = (1.0 * mean_score) / top_kb;
8015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ratio_cutoff = 4.0;                 // ramp down from 100% to 0%: 2.0-4.0
8025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
8035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (ratio <= ratio_cutoff - 2.0) {return 100;}
8045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (ratio > ratio_cutoff) {return 0;}
8055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
8075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return iratio;
8085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
8095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Calculate ratio of score per 1KB vs. expected score per 1KB
8115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
8125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          int bytes, int score) {
8135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Average training-data score for this language-script combo, per 1KB
8145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
8155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (lscript == ULScript_Common) {
8165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We don't know the script (only happens with second-chance score)
8175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Look for first non-zero mean value
8184311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch    for (int i = 2; i >= 0; --i) {
8195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (kMeanScore[lang * 4 + i] > 0) {
8205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        expected_score = kMeanScore[lang * 4 + i];
8214311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch        break;
8225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
8235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
8245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
8255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (expected_score < 100) {
8265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      expected_score = 1000;
8275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
8285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Our score per 1KB
8305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  double our_score = (score << 10) / (bytes ? bytes : 1);  // Avoid zdiv
8315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  double ratio = our_score / expected_score;
8325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Just the raw count normalized as though each language has mean=1000;
8345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ratio = (score * 1000.0) /  expected_score;
8355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return ratio;
8365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
8375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Calculate reliablity of len bytes of script lscript with chunk_tote
8395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::GetReliability(int len, UnicodeLScript lscript,
8405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   const Tote* chunk_tote) {
8415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
8425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Average score for this language-script combo
8435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
8445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (lscript == ULScript_Common) {
8455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We don't know the script (only happens with second-chance score)
8465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Look for first non-zero mean value
8474311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch    for (int i = 2; i >= 0; --i) {
8485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (kMeanScore[cur_lang * 4 + i] > 0) {
8495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        mean_score = kMeanScore[cur_lang * 4 + i];
8504311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch        break;
8515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
8525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
8535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
8545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
8555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                           chunk_tote->Value(1),
8565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                           chunk_tote->GetGramCount());
8575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
8595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                               len,
8605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                               mean_score);
8615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int reliability_min = minint(reliability_delta, reliability_main);
8635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (FLAGS_dbgreli) {
8665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    char temp1[4];
8675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    char temp2[4];
8685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
8695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (temp1[2] == ' ') {temp1[2] = '\0';}
8705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
8715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (temp2[2] == ' ') {temp2[2] = '\0';}
8725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int srclen = len;
8735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d,  %s=%d %s=%d "
8745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   "top/KB=%d mean/KB=%d del=%d%% reli=%d%%   "
8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   "lang/lscript %d %d\n",
8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           chunk_tote->GetGramCount(),
8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           chunk_tote->GetIncrCount(),
8785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           srclen,
8795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           temp1, chunk_tote->Value(0),
8805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           temp2, chunk_tote->Value(1),
8815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
8825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           mean_score,
8835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           reliability_delta,
8845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           reliability_main,
8855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           cur_lang, lscript);
8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
8875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return reliability_min;
8895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
8905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
8935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Miscellaneous
8945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------
8955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Demote all languages except Top40 and plus_one
8975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Do this just before sorting chunk_tote results
8985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
8995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
9005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (chunk_tote->Key(sub) == 0) continue;
9015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (chunk_tote->Key(sub) == packed_plus_one) continue;
9025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
9035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Quarter the score of others
9045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
9055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
9065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
907