15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2009 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/cldutil.h" 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/cldutil_dbg.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h" 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/utf8propletterscriptnum.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_commandlineflags.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_logging.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_unilib.h" 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf.h" 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/win/cld_utf8statetable.h" 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Runtime routines for hashing, looking up, and scoring 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles)// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Unigrams and bigrams are for CJK languages only, including simplified/ 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Zhuang Han characters. Surrounding spaces are not considered. 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Quadgrams and octagrams are for non-CJK and include two bits indicating 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// preceding and trailing spaces (word boundaries). 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Indicator bits for leading/trailing space around quad/octagram 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1-, 2-, or 3-bytes each. 
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kPreSpaceIndicator = 0x00004444; 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kPostSpaceIndicator = 0x44440000; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Little-endian masks for 0..24 bytes picked up as uint32's 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kWordMask0[4] = { 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMinCJKUTF8CharBytes = 3; 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMinGramCount = 3; 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMaxGramCount = 16; 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Routines to access a hash table of <key:wordhash, value:probs> pairs 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Buckets have 4-byte wordhash for sizes < 32K buckets, but only 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// bucket subscript. 
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Probs is packed: three languages plus a subscript for probability table 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Buckets have all the keys together, then all the values. Key array never 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// crosses a cache-line boundary, so no-match case takes exactly one cache miss. 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Match case may sometimes take an additional cache miss on value access. 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// byte buckets with single cache miss. 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Or 2-byte key and 6-byte value, allowing 5 languages instead of three. 
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Design principles for these hash functions 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Few operations 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Latin script expect 1- and 2-byte mixtures. 
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Last byte of each character has about 5 bits of information 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Spread good bits around so they can interact in at least two ways 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// with other characters 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// - Use add for additional mixing through carries 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CJK Three-byte bigram 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ....dddd..cccccc..bbbbbb....aaaa 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ..................ffffff..eeeeee 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ....dddd..cccccc..bbbbbb....aaaa 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 000....dddd..cccccc..bbbbbb....a 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ..................ffffff..eeeeee 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ffffff..eeeeee000000000000000000 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// CJK Four-byte bigram 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ..dddddd..cccccc....bbbb....aaaa 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ..hhhhhh..gggggg....ffff....eeee 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ..dddddd..cccccc....bbbb....aaaa 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 000..dddddd..cccccc....bbbb....a 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ..hhhhhh..gggggg....ffff....eeee 
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ..ffff....eeee000000000000000000 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BIGRAM 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OVERSHOOTS up to 3 bytes 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 cld::BiHashV25(const char* word_ptr, int bytecount) { 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount == 0) { 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 word0, word1; 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount <= 4) { 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3]; 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return word0; 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Else do 8 bytes 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr); 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3]; 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 18); 1115821806d5e7f356e8fa4b058a389a808ea183019Torne 
(Richard Coles) return word0 + word1; 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Ascii-7 One-byte chars 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...ddddd...ccccc...bbbbb...aaaaa 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...ddddd...ccccc...bbbbb...aaaaa 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 000...ddddd...ccccc...bbbbb...aa 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Latin 1- and 2-byte chars 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...ddddd...ccccc...bbbbb...aaaaa 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...................fffff...eeeee 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...ddddd...ccccc...bbbbb...aaaaa 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 000...ddddd...ccccc...bbbbb...aa 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...................fffff...eeeee 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...............fffff...eeeee0000 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Non-CJK Two-byte chars 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...ddddd...........bbbbb........ 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...hhhhh...........fffff........ 
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...ddddd...........bbbbb........ 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 000...ddddd...........bbbbb..... 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...hhhhh...........fffff........ 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// hhhh...........fffff........0000 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Non-CJK Three-byte chars 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...........ccccc................ 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...................fffff........ 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...lllll...................iiiii 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...........ccccc................ 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 000...........ccccc............. 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...................fffff........ 
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...............fffff........0000 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ...lllll...................iiiii 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// .lllll...................iiiii00 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OVERSHOOTS up to 3 bytes 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) { 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 word0, word1, word2; 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount <= 4) { 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3]; 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return word0 ^ prepost; 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (bytecount <= 8) { 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr); 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3]; 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 4); 
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return (word0 ^ prepost) + word1; 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // else do 12 bytes 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr); 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4); 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 4); 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3]; 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word2 = word2 ^ (word2 << 2); 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return (word0 ^ prepost) + word1 + word2; 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM wrapper with surrounding spaces 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) { 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount == 0) { 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 prepost = 0; 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return QuadHashV25Mix(word_ptr, bytecount, prepost); 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM wrapper with surrounding underscores (offline use) 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OVERSHOOTS up to 3 bytes 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For offline construction of tables 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) { 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount == 0) { 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* local_word_ptr = word_ptr; 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int local_bytecount = bytecount; 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 prepost = 0; 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (local_word_ptr[0] == '_') { 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prepost |= kPreSpaceIndicator; 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++local_word_ptr; 
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --local_bytecount; 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (local_word_ptr[local_bytecount - 1] == '_') { 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prepost |= kPostSpaceIndicator; 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --local_bytecount; 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost); 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The low 32 bits follow the pattern from above, tuned to different scripts 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables V3 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64 word0; 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64 word1; 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64 sum; 
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch ((bytecount - 1) >> 2) { 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 0: // 1..4 bytes 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3]; 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum = word0; 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 1: // 5..8 bytes 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr); 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum = word0; 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3]; 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 4); 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 2: // 9..12 bytes 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr); 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum = word0; 
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4); 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 4); 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3]; 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 2); 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 3: // 13..16 bytes 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr); 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum = word0; 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4); 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 4); 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 8); 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 2); 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3]; 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 >> 8); 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case 4: // 17..20 bytes 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(word_ptr); 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum = word0; 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4); 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 4); 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 8); 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 2); 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 12); 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 >> 8); 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 
16) & kWordMask0[bytecount & 3]; 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 >> 4); 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: // 21..24 bytes and higher (ignores beyond 24) 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = UnalignedLoad32(&word_ptr); 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum = word0; 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 = word0 ^ (word0 >> 3); 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 4); 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 4); 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 8); 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 << 2); 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 12); 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 >> 8); 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 16); 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles) sum += word1; 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 >> 4); 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3]; 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += word1; 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word1 = word1 ^ (word1 >> 6); 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word0 += word1; 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sum = (sum & 0xff) << 32; 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return (word0 ^ prepost) + sum; 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM wrapper with surrounding spaces 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The low 32 bits follow the pattern from above, tuned to different scripts 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The high 8 
bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For runtime use of tables V3 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint64 cld::OctaHash40(const char* word_ptr, int bytecount) { 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount == 0) { 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64 prepost = 0; 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return OctaHash40Mix(word_ptr, bytecount, prepost); 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM wrapper with surrounding underscores (offline use) 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The low 32 bits follow the pattern from above, tuned to different scripts 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For offline construction of tables 
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) { 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount == 0) { 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* local_word_ptr = word_ptr; 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int local_bytecount = bytecount; 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64 prepost = 0; 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (local_word_ptr[0] == '_') { 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prepost |= kPreSpaceIndicator; 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++local_word_ptr; 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --local_bytecount; 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (local_word_ptr[local_bytecount - 1] == '_') { 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prepost |= kPostSpaceIndicator; 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --local_bytecount; 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles)//------------------------------------------------------------------------------ 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Scoring single groups of letters 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNIGRAM score one => tote 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Input: 1-byte entry of subscript into unigram probs, plus 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// an accumulator tote. 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Output: running sums in tote updated 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void cld::ProcessProbV25UniTote(int propval, Tote* tote) { 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tote->AddGram(); 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval]; 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);} 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);} 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);} 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);} 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);} 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);} 
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BIGRAM, QUADGRAM, OCTAGRAM score one => tote 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Input: 4-byte entry of 3 language numbers and one probability subscript, plus 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// an accumulator tote. (language 0 means unused entry) 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Output: running sums in tote updated 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) { 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) tote->AddGram(); 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8 prob123 = (probs >> 0) & 0xff; 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* prob123_entry = cld::LgProb2TblEntry(prob123); 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8 top1 = (probs >> 8) & 0xff; 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));} 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8 top2 = (probs >> 16) & 0xff; 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));} 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8 top3 = (probs >> 24) & 0xff; 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));} 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Routines to accumulate probabilities 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score up to n unigrams, returning number of bytes consumed 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Updates tote_grams 4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj, 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* isrc, int srclen, int advance_by, 4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* tote_grams, int gram_limit, Tote* chunk_tote) { 4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* src = isrc; 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);} 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Property-based CJK unigram lookup 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src[0] == ' ') {++src; --srclen;} 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* usrc = reinterpret_cast<const 
uint8*>(src); 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int usrclen = srclen; 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (usrclen > 0) { 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int len = kAdvanceOneChar[usrc[0]]; 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look up property of one UTF-8 character and advance over it 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Return 0 if input length is zero 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Return 0 and advance one byte if input is ill-formed 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen); 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbglookup) { 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgUniTermToStderr(propval, usrc, len); 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (propval > 0) { 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ProcessProbV25UniTote(propval, chunk_tote); 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++(*tote_grams); 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);} 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance by 1/2/4/8 characters (half of quad advance) 
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (advance_by == 2) { 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Already advanced by 1 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (advance_by == 4) { 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance by 2 chars total, if not at end 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (UTFmax <= usrclen) { 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (advance_by == 8) { 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance by 4 chars total, if not at end 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((UTFmax * 3) <= usrclen) { 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance by 8 chars total, if not at end 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((UTFmax * 7) <= usrclen) { 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc 
+= n; usrclen -= n; 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(usrclen >= 0); 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (*tote_grams >= gram_limit) { 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) { 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // With advance_by>2, we consume more input to get the same number of quads 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int len = src - isrc; 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreTop(src, (len * 2) / advance_by, chunk_tote); 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreFlush(); 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int consumed2 = reinterpret_cast<const char*>(usrc) - isrc; 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return consumed2; 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles)} 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// BIGRAM, using hash table, always advancing by 1 char 5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj 5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score all bigrams in isrc, using languages that have bigrams (CJK) 5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return number of bigrams that hit in the hash table 5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj, 5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* isrc, int srclen, Tote* chunk_tote) { 5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int hit_count = 0; 5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* src = isrc; 5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Hashtable-based CJK bigram lookup 5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* usrc = reinterpret_cast<const uint8*>(src); 5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* usrclimit1 = usrc + srclen - UTFmax; 5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) { 5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, " " ); 5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (usrc < usrclimit1) { 5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int len = kAdvanceOneChar[usrc[0]]; 
5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int len2 = kAdvanceOneChar[usrc[len]] + len; 5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible 5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Lookup and score this bigram 5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Always ignore pre/post spaces 5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2); 5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash); 5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Now go indirect on the subscript 5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) probs = bigram_obj->kCLDTableInd[probs & 5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~bigram_obj->kCLDTableKeyMask]; 5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Process the bigram 5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbglookup) { 5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* ssrc = reinterpret_cast<const char*>(usrc); 5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgBiTermToStderr(bihash, probs, ssrc, len2); 5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreRecord(NULL, probs, len2); 5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (FLAGS_dbgscore && (probs != 0)) { 5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* ssrc = reinterpret_cast<const char*>(usrc); 5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreRecord(NULL, probs, len2); 
5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string temp(ssrc, len2); 5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "%s ", temp.c_str()); 5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (probs != 0) { 5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ProcessProbV25Tote(probs, chunk_tote); 5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++hit_count; 5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) usrc += len; // Advance by one char 5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) { 5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "[%d bigrams scored]\n", hit_count); 5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreState(); 5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return hit_count; 5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// QUADGRAM, using hash table, advancing by 2/4/8/16 chars 5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj 5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score up to n 
quadgrams, returning number of bytes consumed 5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Updates tote_grams 5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj, 5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* isrc, int srclen, int advance_by, 5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* tote_grams, int gram_limit, Tote* chunk_tote) { 5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* src = isrc; 5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* srclimit = src + srclen; 5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Limit is end, which has extra 20 20 20 00 past len 5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* srclimit7 = src + srclen - (UTFmax * 7); 5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* srclimit15 = src + srclen - (UTFmax * 15); 5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);} 5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Run a little cache of last hits to catch overly-repetitive "text" 5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int next_prior = 0; 5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 prior_quads[2] = {0, 0}; 5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Visit all quadgrams 5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src[0] == ' ') {++src;} 5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (src < srclimit) { 5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
// Find one quadgram 5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* src_end = src; 5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* src_mid = src_end; 5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int len = src_end - src; 5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Lookup and score this quadgram 5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 quadhash = QuadHashV25(src, len); 5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash); 5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Now go indirect on the subscript 5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) probs = quadgram_obj->kCLDTableInd[probs & 5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~quadgram_obj->kCLDTableKeyMask]; 5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Process the quadgram 5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbglookup) { 5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgQuadTermToStderr(quadhash, probs, src, len); 5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (probs != 0) { 
6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Filter out recent repeats. If this works out, use in the other lookups 6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) { 6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prior_quads[next_prior] = quadhash; 6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next_prior = (next_prior + 1) & 1; 6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ProcessProbV25Tote(probs, chunk_tote); 6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++(*tote_grams); 6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);} 6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance all the way past word if at end-of-word 6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src_end[0] == ' ') { 6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_mid = src_end; 6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance by 2/4/8/16 characters 6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (advance_by == 2) { 6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src = src_mid; 6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (advance_by == 4) { 6195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src = src_end; 6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (advance_by == 8) { 6215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles) // Advance by 8 chars total (4 more), if not at end 6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src < srclimit7) { 6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneChar[(uint8)src_end[0]]; 6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneChar[(uint8)src_end[0]]; 6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneChar[(uint8)src_end[0]]; 6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src_end += kAdvanceOneChar[(uint8)src_end[0]]; 6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src = src_end; 6295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 6305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance by 16 chars total (12 more), if not at end 6315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src < srclimit15) { 6325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance by ~16 chars by adding 3 * current bytelen 6335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int fourcharlen = src_end - src; 6345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src = src_end + (3 * fourcharlen); 6355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance a bit more if mid-character 6365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; 6375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; 6385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 6395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src = src_end; 6405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 
6425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(src < srclimit); 6435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; 6445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (*tote_grams >= gram_limit) { 6465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 6475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) { 6515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // With advance_by>2, we consume more input to get the same number of quads 6525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int len = src - isrc; 6535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreTop(src, (len * 2) / advance_by, chunk_tote); 6545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreFlush(); 6555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int consumed = src - isrc; 6585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If advancing by more than 2, src may have overshot srclimit 6605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (consumed > srclen) { 6615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) consumed = srclen; 6625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return consumed; 
6655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OCTAGRAM, using hash table, always advancing by 1 word 6695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller supplies table, such as &kLongWord8Table_obj 6705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Score all words in isrc, using languages that have quadgrams 6715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// We don't normally use this routine except on the first quadgram run, 6725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but it can be used to resolve unreliable pages. 6735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This routine does not have an optimized advance_by 6745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SOON: Uses indirect language/probability longword 6755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 6765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return number of words that hit in the hash table 6775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj, 6785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* isrc, int srclen, Tote* chunk_tote) { 6795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int hit_count = 0; 6805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* src = isrc; 6815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* srclimit = src + srclen + 1; 6825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Limit is end+1, to include extra space char (0x20) off the end 6835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 
6845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Score all words truncated to 8 characters 6855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int charcount = 0; 6865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Skip any initial space 6875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src[0] == ' ') {++src;} 6885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* word_ptr = src; 6895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* word_end = word_ptr; 6905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) { 6915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, " " ); 6925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (src < srclimit) { 6945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Terminate previous word or continue current word 6955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (src[0] == ' ') { 6965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int bytecount = word_end - word_ptr; 6975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (bytecount == 0) 6985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 6995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Lookup and score this word 7005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64 wordhash40 = OctaHash40(word_ptr, bytecount); 7015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40); 7025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Now go indirect on the subscript 7035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) probs = octagram_obj->kCLDTableInd[probs & 7045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
~octagram_obj->kCLDTableKeyMask]; 7055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // // Lookup and score this word 7075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // uint32 wordhash = QuadHashV25(word_ptr, bytecount); 7085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table, 7095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kLongWord8TableSize); 7105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 7115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbglookup) { 7125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount); 7135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreRecord(NULL, probs, bytecount); 7145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (FLAGS_dbgscore && (probs != 0)) { 7155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreRecord(NULL, probs, bytecount); 7165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string temp(word_ptr, bytecount); 7175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "%s ", temp.c_str()); 7185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (probs != 0) { 7215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ProcessProbV25Tote(probs, chunk_tote); 7225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++hit_count; 7235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) charcount = 0; 7255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_ptr = src + 1; // Over the space 
7265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_end = word_ptr; 7275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 7285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++charcount; 7295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance to next char 7325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src += cld_UniLib::OneCharLen(src); 7335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (charcount <= 8) { 7345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_end = src; 7355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgscore) { 7395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "[%d words scored]\n", hit_count); 7405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DbgScoreState(); 7415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return hit_count; 7435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 7485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reliability calculations, for single language and between languages 7495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles)//------------------------------------------------------------------------------ 7505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return reliablity of result 0..100 for top two scores 7525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable 7535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (on a scale where +1 is a factor of 2 ** 1.6 = 3.02) 7545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Threshold is uni/quadgram increment count, bounded above and below. 7555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 7565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Requiring a factor of 3 improvement (e.g. +1 log base 3) 7575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for each scored quadgram is too stringent, so I've backed this off to a 7585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// factor of 2 (e.g. +5/8 log base 3). 
7595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 7605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// I also somewhat lowered the Min/MaxGramCount limits above 7615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 7625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Added: if fewer than 8 quads/unis, max reliability is 12*n percent 7635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 7645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::ReliabilityDelta(int value1, int value2, int gramcount) { 7655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int max_reliability_percent = 100; 7665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (gramcount < 8) { 7675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_reliability_percent = 12 * gramcount; 7685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above 7705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16 7715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fully_reliable_thresh = kMinGramCount; 7725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (fully_reliable_thresh > kMaxGramCount) { 7735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fully_reliable_thresh = kMaxGramCount; 7745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int delta = value1 - value2; 7775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (delta >= fully_reliable_thresh) {return max_reliability_percent;} 7785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (delta <= 0) {return 0;} 
7795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return cld::minint(max_reliability_percent, 7805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (100 * delta) / fully_reliable_thresh); 7815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return reliablity of result 0..100 for top score vs. mainsteam score 7845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Values are score per 1024 bytes of input 7855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ratio = max(top/mainstream, mainstream/top) 7865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable 7875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Change: short-text word scoring can give unusually good results. 7885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Let top exceed mainstream by 4x at 50% reliable 7895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::ReliabilityMainstream(int topscore, int len, int mean_score) { 7905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (mean_score == 0) {return 100;} // No reliability data available yet 7915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (topscore == 0) {return 0;} // zero score = unreliable 7925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (len == 0) {return 0;} // zero len = unreliable 7935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int top_kb = (topscore << 10) / len; 7945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double ratio; 7955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double ratio_cutoff; 7965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (top_kb > mean_score) { 7975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard 
Coles) ratio = (1.0 * top_kb) / mean_score; 7985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ratio_cutoff = 5.0; // ramp down from 100% to 0%: 3.0-5.0 7995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 8005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ratio = (1.0 * mean_score) / top_kb; 8015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ratio_cutoff = 4.0; // ramp down from 100% to 0%: 2.0-4.0 8025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ratio <= ratio_cutoff - 2.0) {return 100;} 8045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ratio > ratio_cutoff) {return 0;} 8055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0); 8075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return iratio; 8085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Calculate ratio of score per 1KB vs. 
expected score per 1KB 8115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript, 8125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int bytes, int score) { 8135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Average training-data score for this language-script combo, per 1KB 8145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int expected_score = kMeanScore[lang * 4 + LScript4(lscript)]; 8155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lscript == ULScript_Common) { 8165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We don't know the script (only happens with second-chance score) 8175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look for first non-zero mean value 8184311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch for (int i = 2; i >= 0; --i) { 8195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kMeanScore[lang * 4 + i] > 0) { 8205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_score = kMeanScore[lang * 4 + i]; 8214311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch break; 8225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (expected_score < 100) { 8265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) expected_score = 1000; 8275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Our score per 1KB 8305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double our_score = (score << 10) / (bytes ? 
bytes : 1); // Avoid zdiv 8315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double ratio = our_score / expected_score; 8325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Just the raw count normalized as though each language has mean=1000; 8345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ratio = (score * 1000.0) / expected_score; 8355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ratio; 8365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Calculate reliablity of len bytes of script lscript with chunk_tote 8395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int cld::GetReliability(int len, UnicodeLScript lscript, 8405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Tote* chunk_tote) { 8415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Language cur_lang = UnpackLanguage(chunk_tote->Key(0)); 8425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Average score for this language-script combo 8435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)]; 8445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lscript == ULScript_Common) { 8455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We don't know the script (only happens with second-chance score) 8465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look for first non-zero mean value 8474311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch for (int i = 2; i >= 0; --i) { 8485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kMeanScore[cur_lang * 4 + i] > 0) { 8495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mean_score = kMeanScore[cur_lang * 4 + i]; 
8504311e82a78ceafbe0585f51d4c8a86df9f21aa0dBen Murdoch break; 8515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int reliability_delta = ReliabilityDelta(chunk_tote->Value(0), 8555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) chunk_tote->Value(1), 8565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) chunk_tote->GetGramCount()); 8575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int reliability_main = ReliabilityMainstream(chunk_tote->Value(0), 8595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) len, 8605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mean_score); 8615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int reliability_min = minint(reliability_delta, reliability_main); 8635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_dbgreli) { 8665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char temp1[4]; 8675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char temp2[4]; 8685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1); 8695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (temp1[2] == ' ') {temp1[2] = '\0';} 8705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2); 8715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (temp2[2] == ' ') {temp2[2] = '\0';} 
8725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int srclen = len; 8735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d " 8745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "top/KB=%d mean/KB=%d del=%d%% reli=%d%% " 8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "lang/lscript %d %d\n", 8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) chunk_tote->GetGramCount(), 8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) chunk_tote->GetIncrCount(), 8785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) srclen, 8795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) temp1, chunk_tote->Value(0), 8805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) temp2, chunk_tote->Value(1), 8815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1), 8825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mean_score, 8835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reliability_delta, 8845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reliability_main, 8855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cur_lang, lscript); 8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return reliability_min; 8895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 8935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Miscellaneous 
8945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//------------------------------------------------------------------------------ 8955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Demote all languages except Top40 and plus_one 8975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Do this just before sorting chunk_tote results 8985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) { 8995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) { 9005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (chunk_tote->Key(sub) == 0) continue; 9015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (chunk_tote->Key(sub) == packed_plus_one) continue; 9025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kIsPackedTop40[chunk_tote->Key(sub)]) continue; 9035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Quarter the score of others 9045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2); 9055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 907