11059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard/* 21059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * Copyright (C) 2011 The Android Open Source Project 31059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * 41059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * Licensed under the Apache License, Version 2.0 (the "License"); 51059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * you may not use this file except in compliance with the License. 61059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * You may obtain a copy of the License at 71059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * 81059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * http://www.apache.org/licenses/LICENSE-2.0 91059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * 101059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * Unless required by applicable law or agreed to in writing, software 111059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * distributed under the License is distributed on an "AS IS" BASIS, 121059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 131059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * See the License for the specific language governing permissions and 141059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * limitations under the License. 151059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard */ 161059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 171059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard#ifndef LATINIME_BINARY_FORMAT_H 181059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard#define LATINIME_BINARY_FORMAT_H 191059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard#include <limits> 2149ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard#include "bloom_filter.h" 22f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard#include "unigram_dictionary.h" 23f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard 241059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardnamespace latinime { 251059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 261059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardclass BinaryFormat { 27e12e9b5b69e6242af61ee690a81bedde1bdd4936Ken Wakasa private: 281059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 291059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; 301059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; 311059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 32e12e9b5b69e6242af61ee690a81bedde1bdd4936Ken Wakasa public: 33f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard const static int UNKNOWN_FORMAT = -1; 3446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Originally, format version 1 had a 16-bit magic number, then the version number `01' 3546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // then options that must be 0. Hence the first 32-bits of the format are always as follow 3646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // and it's okay to consider them a magic number as a whole. 3746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; 3846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; 3946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // The versions of Latin IME that only handle format version 1 only test for the magic 4046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // number, so we had to change it so that version 2 files would be rejected by older 4146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // implementations. On this occasion, we made the magic number 32 bits long. 4246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; 43f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard 449a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; 459a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard const static int SHORTCUT_LIST_SIZE_SIZE = 2; 469a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard 47f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard static int detectFormat(const uint8_t* const dict); 4846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard static unsigned int getHeaderSize(const uint8_t* const dict); 49e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard static unsigned int getFlags(const uint8_t* const dict); 501059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); 511059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); 521059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); 531059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); 541059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int skipOtherCharacters(const uint8_t* const dict, const int pos); 551059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int skipChildrenPosition(const uint8_t flags, const int pos); 561059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int skipFrequency(const uint8_t flags, const int pos); 579a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos); 589a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos); 591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); 601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, 611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int pos); 621059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos); 631059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static bool hasChildrenInFlags(const uint8_t flags); 641059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, 651059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard int *pos); 66522a04ea5b249d0af556647d2abcad57e5b99b4fJean Chalard static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, 676a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard const int length); 68588e2f296451a8eb074af9140d018b828105237fJean Chalard static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, 69e308459531a4dd64ee80aa76e351725180ad856eJean Chalard uint16_t* outWord, int* outUnigramFrequency); 7019ebd936462ee6e4796b8755be82d67437406845Jean Chalard static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); 7149ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard static int getProbability(const int position, const std::map<int, int> *bigramMap, 7249ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard const uint8_t *bigramFilter, const int unigramFreq); 73e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard 74e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard // Flags for special processing 75e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or 76e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard // something very bad (like, the apocalypse) will happen. Please update both at the same time. 77e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard enum { 78e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1, 79e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4 80e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard }; 81e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard const static unsigned int NO_FLAGS = 0; 821059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}; 831059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 84f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalardinline int BinaryFormat::detectFormat(const uint8_t* const dict) { 8546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // The magic number is stored big-endian. 8646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; 8746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard switch (magicNumber) { 8846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard case FORMAT_VERSION_1_MAGIC_NUMBER: 8946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Format 1 header is exactly 5 bytes long and looks like: 9046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Magic number (2 bytes) 0x78 0xB1 9146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Version number (1 byte) 0x01 9246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Options (2 bytes) must be 0x00 0x00 9346a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard return 1; 9446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard case FORMAT_VERSION_2_MAGIC_NUMBER: 9546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Format 2 header is as follows: 9646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE 9746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Version number (2 bytes) 0x00 0x02 985b0761e6a94227d6ef788f589fb6edcd44ed791fJean Chalard // Options (2 bytes) 9946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // Header size (4 bytes) : integer, big endian 10046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard return (dict[4] << 8) + dict[5]; 10146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard default: 10246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard return UNKNOWN_FORMAT; 10346a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard } 10446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard} 10546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard 106e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalardinline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) { 107e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard switch (detectFormat(dict)) { 108e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard case 1: 109e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard return NO_FLAGS; 110e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard default: 111e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard return (dict[6] << 8) + dict[7]; 112e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard } 113e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard} 114e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard 11546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalardinline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { 11646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard switch (detectFormat(dict)) { 11746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard case 1: 11846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard return FORMAT_VERSION_1_HEADER_SIZE; 11946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard case 2: 12046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard // See the format of the header in the comment in detectFormat() above 12146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11]; 12246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard default: 12346a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard return std::numeric_limits<unsigned int>::max(); 12446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard } 125f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard} 126f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard 1271059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { 1284c0eca6e416485be61d7fddcad1e1552444daf85Jean Chalard const int msb = dict[(*pos)++]; 1294c0eca6e416485be61d7fddcad1e1552444daf85Jean Chalard if (msb < 0x80) return msb; 1304c0eca6e416485be61d7fddcad1e1552444daf85Jean Chalard return ((msb & 0x7F) << 8) | dict[(*pos)++]; 1311059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 1321059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 1331059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { 1341059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return dict[(*pos)++]; 1351059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 1361059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 1371059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) { 1381059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int origin = *pos; 1391059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int32_t character = dict[origin]; 1401059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 1411059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard if (character == CHARACTER_ARRAY_TERMINATOR) { 1421059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *pos = origin + 1; 1431059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return NOT_A_CHARACTER; 1441059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } else { 1451059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *pos = origin + 3; 1461059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int32_t char_1 = character << 16; 1471059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int32_t char_2 = char_1 + (dict[origin + 1] << 8); 1481059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return char_2 + dict[origin + 2]; 1491059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 1501059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } else { 1511059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *pos = origin + 1; 1521059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return character; 1531059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 1541059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 1551059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 1561059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict, 1571059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int pos) { 1581059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return dict[pos]; 1591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 1601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 1611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) { 1621059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard int currentPos = pos; 1631059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard int32_t character = dict[currentPos++]; 1641059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard while (CHARACTER_ARRAY_TERMINATOR != character) { 1651059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 1661059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; 1671059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 1681059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard character = dict[currentPos++]; 1691059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 1701059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return currentPos; 1711059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 1721059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 1731059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardstatic inline int attributeAddressSize(const uint8_t flags) { 1741059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static const int ATTRIBUTE_ADDRESS_SHIFT = 4; 1751059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; 1761059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard /* Note: this is a value-dependant optimization of what may probably be 1771059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard more readably written this way: 1781059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { 1791059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; 1801059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; 1811059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; 1821059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard default: return 0; 1831059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 1841059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard */ 1851059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 1861059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 1879a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardstatic inline int skipExistingBigrams(const uint8_t* const dict, const int pos) { 1881059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard int currentPos = pos; 1899a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); 1901059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { 1911059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard currentPos += attributeAddressSize(flags); 1929a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); 1931059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 1941059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard currentPos += attributeAddressSize(flags); 1951059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return currentPos; 1961059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 1971059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 1981059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardstatic inline int childrenAddressSize(const uint8_t flags) { 1991059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard static const int CHILDREN_ADDRESS_SHIFT = 6; 2001059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; 2011059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard /* See the note in attributeAddressSize. The same applies here */ 2021059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 2031059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2049a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardstatic inline int shortcutByteSize(const uint8_t* const dict, const int pos) { 2059a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard return ((int)(dict[pos] << 8)) + (dict[pos + 1]); 2069a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard} 2079a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard 2081059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { 2091059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return pos + childrenAddressSize(flags); 2101059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 2111059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2121059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { 2131059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; 2141059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 2151059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2169a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardinline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags, 2171059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int pos) { 218e0e339699addbdc837b79c110a8432f0641d16eeJean Chalard if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { 2199a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard return pos + shortcutByteSize(dict, pos); 2209a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard } else { 2219a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard return pos; 222e0e339699addbdc837b79c110a8432f0641d16eeJean Chalard } 2239a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard} 2249a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard 2259a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardinline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags, 2269a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard const int pos) { 2271059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { 2289a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard return skipExistingBigrams(dict, pos); 2299a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard } else { 2309a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard return pos; 2311059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 2329a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard} 2339a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard 2349a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardinline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, 2359a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard const int pos) { 2369a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard // This function skips all attributes: shortcuts and bigrams. 2379a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard int newPos = pos; 2389a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard newPos = skipShortcuts(dict, flags, newPos); 2399a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard newPos = skipBigrams(dict, flags, newPos); 240e0e339699addbdc837b79c110a8432f0641d16eeJean Chalard return newPos; 2411059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 2421059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2431059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, 2441059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const uint8_t flags, const int pos) { 2451059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard int currentPos = pos; 2461059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard currentPos = skipChildrenPosition(flags, currentPos); 2471059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard currentPos = skipAllAttributes(dict, flags, currentPos); 2481059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return currentPos; 2491059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 2501059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2511059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags, 2521059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int pos) { 2531059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard int offset = 0; 2541059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { 2551059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 2561059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset = dict[pos]; 2571059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard break; 2581059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 2591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset = dict[pos] << 8; 2601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset += dict[pos + 1]; 2611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard break; 2621059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 2631059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset = dict[pos] << 16; 2641059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset += dict[pos + 1] << 8; 2651059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset += dict[pos + 2]; 2661059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard break; 2671059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard default: 2681059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard // If we come here, it means we asked for the children of a word with 2691059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard // no children. 2701059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return -1; 2711059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 2721059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return pos + offset; 2731059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 2741059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2751059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { 2761059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 2771059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); 2781059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 2791059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 2801059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict, 2811059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const uint8_t flags, int *pos) { 2821059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard int offset = 0; 2831059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard const int origin = *pos; 2841059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { 2851059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 2861059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset = dict[origin]; 2871059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *pos = origin + 1; 2881059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard break; 2891059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 2901059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset = dict[origin] << 8; 2911059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset += dict[origin + 1]; 2921059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *pos = origin + 2; 2931059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard break; 2941059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 2951059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset = dict[origin] << 16; 2961059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset += dict[origin + 1] << 8; 2971059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard offset += dict[origin + 2]; 2981059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *pos = origin + 3; 2991059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard break; 3001059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 3011059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { 3021059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return origin - offset; 3031059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } else { 3041059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard return origin + offset; 3051059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard } 3061059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} 3071059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 3086a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard// This function gets the byte position of the last chargroup of the exact matching word in the 3096a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard// dictionary. If no match is found, it returns NOT_VALID_WORD. 3106a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalardinline int BinaryFormat::getTerminalPosition(const uint8_t* const root, 311522a04ea5b249d0af556647d2abcad57e5b99b4fJean Chalard const int32_t* const inWord, const int length) { 3126a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard int pos = 0; 3136a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard int wordPos = 0; 3146a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard 3156a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard while (true) { 3166a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // If we already traversed the tree further than the word is long, there means 3176a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // there was no match (or we would have found it). 3186a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (wordPos > length) return NOT_VALID_WORD; 3196a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); 320522a04ea5b249d0af556647d2abcad57e5b99b4fJean Chalard const int32_t wChar = inWord[wordPos]; 3216a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard while (true) { 3226a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // If there are no more character groups in this node, it means we could not 3236a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // find a matching character for this depth, therefore there is no match. 3246a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (0 >= charGroupCount) return NOT_VALID_WORD; 3256a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard const int charGroupPos = pos; 3266a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 3276a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 3286a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (character == wChar) { 3296a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // This is the correct node. Only one character group may start with the same 3306a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // char within a node, so either we found our match in this node, or there is 3316a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // no match and we can return NOT_VALID_WORD. So we will check all the characters 3326a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // in this character group indeed does match. 3336a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 3346a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 3356a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard while (NOT_A_CHARACTER != character) { 3366a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard ++wordPos; 3376a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // If we shoot the length of the word we search for, or if we find a single 3386a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // character that does not match, as explained above, it means the word is 3396a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // not in the dictionary (by virtue of this chargroup being the only one to 3406a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // match the word on the first character, but not matching the whole word). 3416a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (wordPos > length) return NOT_VALID_WORD; 3426a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (inWord[wordPos] != character) return NOT_VALID_WORD; 3436a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 3446a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3456a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3466a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // If we come here we know that so far, we do match. Either we are on a terminal 3476a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // and we match the length, in which case we found it, or we traverse children. 3486a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // If we don't match the length AND don't have children, then a word in the 3496a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // dictionary fully matches a prefix of the searched word but not the full word. 3506a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard ++wordPos; 3516a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { 3526a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (wordPos == length) { 3536a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard return charGroupPos; 3546a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3556a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); 3566a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3576a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 3586a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) { 3596a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard return NOT_VALID_WORD; 3606a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3616a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // We have children and we are still shorter than the word we are searching for, so 3626a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // we need to traverse children. Put the pointer on the children position, and 3636a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // break 3646a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard pos = BinaryFormat::readChildrenPosition(root, flags, pos); 3656a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard break; 3666a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } else { 3676a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard // This chargroup does not match, so skip the remaining part and go to the next. 3686a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 3696a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard pos = BinaryFormat::skipOtherCharacters(root, pos); 3706a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3716a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard pos = BinaryFormat::skipFrequency(flags, pos); 3726a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); 3736a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3746a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard --charGroupCount; 3756a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3766a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard } 3776a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard} 3786a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard 379588e2f296451a8eb074af9140d018b828105237fJean Chalard// This function searches for a terminal in the dictionary by its address. 380588e2f296451a8eb074af9140d018b828105237fJean Chalard// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, 381588e2f296451a8eb074af9140d018b828105237fJean Chalard// it is possible to check for this with advantageous complexity. For each node, we search 382588e2f296451a8eb074af9140d018b828105237fJean Chalard// for groups with children and compare the children address with the address we look for. 383588e2f296451a8eb074af9140d018b828105237fJean Chalard// When we shoot the address we look for, it means the word we look for is in the children 384588e2f296451a8eb074af9140d018b828105237fJean Chalard// of the previous group. The only tricky part is the fact that if we arrive at the end of a 385588e2f296451a8eb074af9140d018b828105237fJean Chalard// node with the last group's children address still less than what we are searching for, we 386588e2f296451a8eb074af9140d018b828105237fJean Chalard// must descend the last group's children (for example, if the word we are searching for starts 387588e2f296451a8eb074af9140d018b828105237fJean Chalard// with a z, it's the last group of the root node, so all children addresses will be smaller 388588e2f296451a8eb074af9140d018b828105237fJean Chalard// than the address we look for, and we have to descend the z node). 389588e2f296451a8eb074af9140d018b828105237fJean Chalard/* Parameters : 390588e2f296451a8eb074af9140d018b828105237fJean Chalard * root: the dictionary buffer 391588e2f296451a8eb074af9140d018b828105237fJean Chalard * address: the byte position of the last chargroup of the word we are searching for (this is 392588e2f296451a8eb074af9140d018b828105237fJean Chalard * what is stored as the "bigram address" in each bigram) 393588e2f296451a8eb074af9140d018b828105237fJean Chalard * outword: an array to write the found word, with MAX_WORD_LENGTH size. 394e308459531a4dd64ee80aa76e351725180ad856eJean Chalard * outUnigramFrequency: a pointer to an int to write the frequency into. 395588e2f296451a8eb074af9140d018b828105237fJean Chalard * Return value : the length of the word, of 0 if the word was not found. 396588e2f296451a8eb074af9140d018b828105237fJean Chalard */ 397588e2f296451a8eb074af9140d018b828105237fJean Chalardinline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, 398e308459531a4dd64ee80aa76e351725180ad856eJean Chalard const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { 399588e2f296451a8eb074af9140d018b828105237fJean Chalard int pos = 0; 400588e2f296451a8eb074af9140d018b828105237fJean Chalard int wordPos = 0; 401588e2f296451a8eb074af9140d018b828105237fJean Chalard 402588e2f296451a8eb074af9140d018b828105237fJean Chalard // One iteration of the outer loop iterates through nodes. As stated above, we will only 403588e2f296451a8eb074af9140d018b828105237fJean Chalard // traverse nodes that are actually a part of the terminal we are searching, so each time 404588e2f296451a8eb074af9140d018b828105237fJean Chalard // we enter this loop we are one depth level further than last time. 405588e2f296451a8eb074af9140d018b828105237fJean Chalard // The only reason we count nodes is because we want to reduce the probability of infinite 406588e2f296451a8eb074af9140d018b828105237fJean Chalard // looping in case there is a bug. Since we know there is an upper bound to the depth we are 407588e2f296451a8eb074af9140d018b828105237fJean Chalard // supposed to traverse, it does not hurt to count iterations. 408588e2f296451a8eb074af9140d018b828105237fJean Chalard for (int loopCount = maxDepth; loopCount > 0; --loopCount) { 409588e2f296451a8eb074af9140d018b828105237fJean Chalard int lastCandidateGroupPos = 0; 410588e2f296451a8eb074af9140d018b828105237fJean Chalard // Let's loop through char groups in this node searching for either the terminal 411588e2f296451a8eb074af9140d018b828105237fJean Chalard // or one of its ascendants. 412588e2f296451a8eb074af9140d018b828105237fJean Chalard for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0; 413588e2f296451a8eb074af9140d018b828105237fJean Chalard --charGroupCount) { 414588e2f296451a8eb074af9140d018b828105237fJean Chalard const int startPos = pos; 415588e2f296451a8eb074af9140d018b828105237fJean Chalard const uint8_t flags = getFlagsAndForwardPointer(root, &pos); 416588e2f296451a8eb074af9140d018b828105237fJean Chalard const int32_t character = getCharCodeAndForwardPointer(root, &pos); 417588e2f296451a8eb074af9140d018b828105237fJean Chalard if (address == startPos) { 418588e2f296451a8eb074af9140d018b828105237fJean Chalard // We found the address. Copy the rest of the word in the buffer and return 419588e2f296451a8eb074af9140d018b828105237fJean Chalard // the length. 420588e2f296451a8eb074af9140d018b828105237fJean Chalard outWord[wordPos] = character; 421588e2f296451a8eb074af9140d018b828105237fJean Chalard if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 422588e2f296451a8eb074af9140d018b828105237fJean Chalard int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); 423588e2f296451a8eb074af9140d018b828105237fJean Chalard // We count chars in order to avoid infinite loops if the file is broken or 424588e2f296451a8eb074af9140d018b828105237fJean Chalard // if there is some other bug 425588e2f296451a8eb074af9140d018b828105237fJean Chalard int charCount = maxDepth; 426402b0570505c7ea1389e1c153e5db0300568ce26Jean Chalard while (NOT_A_CHARACTER != nextChar && --charCount > 0) { 427588e2f296451a8eb074af9140d018b828105237fJean Chalard outWord[++wordPos] = nextChar; 428588e2f296451a8eb074af9140d018b828105237fJean Chalard nextChar = getCharCodeAndForwardPointer(root, &pos); 429588e2f296451a8eb074af9140d018b828105237fJean Chalard } 430588e2f296451a8eb074af9140d018b828105237fJean Chalard } 431e308459531a4dd64ee80aa76e351725180ad856eJean Chalard *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); 432588e2f296451a8eb074af9140d018b828105237fJean Chalard return ++wordPos; 433588e2f296451a8eb074af9140d018b828105237fJean Chalard } 434588e2f296451a8eb074af9140d018b828105237fJean Chalard // We need to skip past this char group, so skip any remaining chars after the 435588e2f296451a8eb074af9140d018b828105237fJean Chalard // first and possibly the frequency. 436588e2f296451a8eb074af9140d018b828105237fJean Chalard if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 437588e2f296451a8eb074af9140d018b828105237fJean Chalard pos = skipOtherCharacters(root, pos); 438588e2f296451a8eb074af9140d018b828105237fJean Chalard } 439588e2f296451a8eb074af9140d018b828105237fJean Chalard pos = skipFrequency(flags, pos); 440588e2f296451a8eb074af9140d018b828105237fJean Chalard 441588e2f296451a8eb074af9140d018b828105237fJean Chalard // The fact that this group has children is very important. Since we already know 442588e2f296451a8eb074af9140d018b828105237fJean Chalard // that this group does not match, if it has no children we know it is irrelevant 443588e2f296451a8eb074af9140d018b828105237fJean Chalard // to what we are searching for. 444588e2f296451a8eb074af9140d018b828105237fJean Chalard const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != 445588e2f296451a8eb074af9140d018b828105237fJean Chalard (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); 446588e2f296451a8eb074af9140d018b828105237fJean Chalard // We will write in `found' whether we have passed the children address we are 447588e2f296451a8eb074af9140d018b828105237fJean Chalard // searching for. For example if we search for "beer", the children of b are less 448588e2f296451a8eb074af9140d018b828105237fJean Chalard // than the address we are searching for and the children of c are greater. When we 449588e2f296451a8eb074af9140d018b828105237fJean Chalard // come here for c, we realize this is too big, and that we should descend b. 450588e2f296451a8eb074af9140d018b828105237fJean Chalard bool found; 451588e2f296451a8eb074af9140d018b828105237fJean Chalard if (hasChildren) { 452588e2f296451a8eb074af9140d018b828105237fJean Chalard // Here comes the tricky part. First, read the children position. 453588e2f296451a8eb074af9140d018b828105237fJean Chalard const int childrenPos = readChildrenPosition(root, flags, pos); 454588e2f296451a8eb074af9140d018b828105237fJean Chalard if (childrenPos > address) { 455588e2f296451a8eb074af9140d018b828105237fJean Chalard // If the children pos is greater than address, it means the previous chargroup, 456588e2f296451a8eb074af9140d018b828105237fJean Chalard // which address is stored in lastCandidateGroupPos, was the right one. 457588e2f296451a8eb074af9140d018b828105237fJean Chalard found = true; 458588e2f296451a8eb074af9140d018b828105237fJean Chalard } else if (1 >= charGroupCount) { 459588e2f296451a8eb074af9140d018b828105237fJean Chalard // However if we are on the LAST group of this node, and we have NOT shot the 460588e2f296451a8eb074af9140d018b828105237fJean Chalard // address we should descend THIS node. So we trick the lastCandidateGroupPos 461588e2f296451a8eb074af9140d018b828105237fJean Chalard // so that we will descend this node, not the previous one. 462588e2f296451a8eb074af9140d018b828105237fJean Chalard lastCandidateGroupPos = startPos; 463588e2f296451a8eb074af9140d018b828105237fJean Chalard found = true; 464588e2f296451a8eb074af9140d018b828105237fJean Chalard } else { 465588e2f296451a8eb074af9140d018b828105237fJean Chalard // Else, we should continue looking. 466588e2f296451a8eb074af9140d018b828105237fJean Chalard found = false; 467588e2f296451a8eb074af9140d018b828105237fJean Chalard } 468588e2f296451a8eb074af9140d018b828105237fJean Chalard } else { 469588e2f296451a8eb074af9140d018b828105237fJean Chalard // Even if we don't have children here, we could still be on the last group of this 470588e2f296451a8eb074af9140d018b828105237fJean Chalard // node. If this is the case, we should descend the last group that had children, 471588e2f296451a8eb074af9140d018b828105237fJean Chalard // and their address is already in lastCandidateGroup. 472588e2f296451a8eb074af9140d018b828105237fJean Chalard found = (1 >= charGroupCount); 473588e2f296451a8eb074af9140d018b828105237fJean Chalard } 474588e2f296451a8eb074af9140d018b828105237fJean Chalard 475588e2f296451a8eb074af9140d018b828105237fJean Chalard if (found) { 476588e2f296451a8eb074af9140d018b828105237fJean Chalard // Okay, we found the group we should descend. Its address is in 477588e2f296451a8eb074af9140d018b828105237fJean Chalard // the lastCandidateGroupPos variable, so we just re-read it. 478588e2f296451a8eb074af9140d018b828105237fJean Chalard if (0 != lastCandidateGroupPos) { 479588e2f296451a8eb074af9140d018b828105237fJean Chalard const uint8_t lastFlags = 480588e2f296451a8eb074af9140d018b828105237fJean Chalard getFlagsAndForwardPointer(root, &lastCandidateGroupPos); 481588e2f296451a8eb074af9140d018b828105237fJean Chalard const int32_t lastChar = 482588e2f296451a8eb074af9140d018b828105237fJean Chalard getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 483588e2f296451a8eb074af9140d018b828105237fJean Chalard // We copy all the characters in this group to the buffer 484588e2f296451a8eb074af9140d018b828105237fJean Chalard outWord[wordPos] = lastChar; 485588e2f296451a8eb074af9140d018b828105237fJean Chalard if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { 486588e2f296451a8eb074af9140d018b828105237fJean Chalard int32_t nextChar = 487588e2f296451a8eb074af9140d018b828105237fJean Chalard getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 488588e2f296451a8eb074af9140d018b828105237fJean Chalard int charCount = maxDepth; 489588e2f296451a8eb074af9140d018b828105237fJean Chalard while (-1 != nextChar && --charCount > 0) { 490588e2f296451a8eb074af9140d018b828105237fJean Chalard outWord[++wordPos] = nextChar; 491588e2f296451a8eb074af9140d018b828105237fJean Chalard nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 492588e2f296451a8eb074af9140d018b828105237fJean Chalard } 493588e2f296451a8eb074af9140d018b828105237fJean Chalard } 494588e2f296451a8eb074af9140d018b828105237fJean Chalard ++wordPos; 495588e2f296451a8eb074af9140d018b828105237fJean Chalard // Now we only need to branch to the children address. Skip the frequency if 496588e2f296451a8eb074af9140d018b828105237fJean Chalard // it's there, read pos, and break to resume the search at pos. 497588e2f296451a8eb074af9140d018b828105237fJean Chalard lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); 498588e2f296451a8eb074af9140d018b828105237fJean Chalard pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); 499588e2f296451a8eb074af9140d018b828105237fJean Chalard break; 500588e2f296451a8eb074af9140d018b828105237fJean Chalard } else { 501588e2f296451a8eb074af9140d018b828105237fJean Chalard // Here is a little tricky part: we come here if we found out that all children 502588e2f296451a8eb074af9140d018b828105237fJean Chalard // addresses in this group are bigger than the address we are searching for. 503588e2f296451a8eb074af9140d018b828105237fJean Chalard // Should we conclude the word is not in the dictionary? No! It could still be 504588e2f296451a8eb074af9140d018b828105237fJean Chalard // one of the remaining chargroups in this node, so we have to keep looking in 505588e2f296451a8eb074af9140d018b828105237fJean Chalard // this node until we find it (or we realize it's not there either, in which 506588e2f296451a8eb074af9140d018b828105237fJean Chalard // case it's actually not in the dictionary). Pass the end of this group, ready 507588e2f296451a8eb074af9140d018b828105237fJean Chalard // to start the next one. 508588e2f296451a8eb074af9140d018b828105237fJean Chalard pos = skipChildrenPosAndAttributes(root, flags, pos); 509588e2f296451a8eb074af9140d018b828105237fJean Chalard } 510588e2f296451a8eb074af9140d018b828105237fJean Chalard } else { 511588e2f296451a8eb074af9140d018b828105237fJean Chalard // If we did not find it, we should record the last children address for the next 512588e2f296451a8eb074af9140d018b828105237fJean Chalard // iteration. 513588e2f296451a8eb074af9140d018b828105237fJean Chalard if (hasChildren) lastCandidateGroupPos = startPos; 514588e2f296451a8eb074af9140d018b828105237fJean Chalard // Now skip the end of this group (children pos and the attributes if any) so that 515588e2f296451a8eb074af9140d018b828105237fJean Chalard // our pos is after the end of this char group, at the start of the next one. 516588e2f296451a8eb074af9140d018b828105237fJean Chalard pos = skipChildrenPosAndAttributes(root, flags, pos); 517588e2f296451a8eb074af9140d018b828105237fJean Chalard } 518588e2f296451a8eb074af9140d018b828105237fJean Chalard 519588e2f296451a8eb074af9140d018b828105237fJean Chalard } 520588e2f296451a8eb074af9140d018b828105237fJean Chalard } 521588e2f296451a8eb074af9140d018b828105237fJean Chalard // If we have looked through all the chargroups and found no match, the address is 522588e2f296451a8eb074af9140d018b828105237fJean Chalard // not the address of a terminal in this dictionary. 523588e2f296451a8eb074af9140d018b828105237fJean Chalard return 0; 524588e2f296451a8eb074af9140d018b828105237fJean Chalard} 525588e2f296451a8eb074af9140d018b828105237fJean Chalard 5269416c814035c65f26ae50c6555de8be84db9860dJean Chalardstatic inline int backoff(const int unigramFreq) { 5279416c814035c65f26ae50c6555de8be84db9860dJean Chalard return unigramFreq; 5289416c814035c65f26ae50c6555de8be84db9860dJean Chalard // For some reason, applying the backoff weight gives bad results in tests. To apply the 5299416c814035c65f26ae50c6555de8be84db9860dJean Chalard // backoff weight, we divide the probability by 2, which in our storing format means 5309416c814035c65f26ae50c6555de8be84db9860dJean Chalard // decreasing the score by 8. 5319416c814035c65f26ae50c6555de8be84db9860dJean Chalard // TODO: figure out what's wrong with this. 5329416c814035c65f26ae50c6555de8be84db9860dJean Chalard // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); 5339416c814035c65f26ae50c6555de8be84db9860dJean Chalard} 5349416c814035c65f26ae50c6555de8be84db9860dJean Chalard 53519ebd936462ee6e4796b8755be82d67437406845Jean Chalardinline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { 53619ebd936462ee6e4796b8755be82d67437406845Jean Chalard // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the 53719ebd936462ee6e4796b8755be82d67437406845Jean Chalard // unigram frequency to be the median value of the 17th step from the top. A value of 53819ebd936462ee6e4796b8755be82d67437406845Jean Chalard // 0 for the bigram frequency represents the middle of the 16th step from the top, 53919ebd936462ee6e4796b8755be82d67437406845Jean Chalard // while a value of 15 represents the middle of the top step. 54019ebd936462ee6e4796b8755be82d67437406845Jean Chalard // See makedict.BinaryDictInputOutput for details. 54119ebd936462ee6e4796b8755be82d67437406845Jean Chalard const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); 542cb99376307f0d57e2935449f93fc162253dcdd01Jean Chalard return (int)(unigramFreq + (bigramFreq + 1) * stepSize); 54319ebd936462ee6e4796b8755be82d67437406845Jean Chalard} 54419ebd936462ee6e4796b8755be82d67437406845Jean Chalard 5459416c814035c65f26ae50c6555de8be84db9860dJean Chalard// This returns a probability in log space. 54649ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalardinline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, 5478950ce6c44706467bb386570ae236a2b8b983666Jean Chalard const uint8_t *bigramFilter, const int unigramFreq) { 5489416c814035c65f26ae50c6555de8be84db9860dJean Chalard if (!bigramMap || !bigramFilter) return backoff(unigramFreq); 5499416c814035c65f26ae50c6555de8be84db9860dJean Chalard if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); 5509416c814035c65f26ae50c6555de8be84db9860dJean Chalard const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); 5519416c814035c65f26ae50c6555de8be84db9860dJean Chalard if (bigramFreqIt != bigramMap->end()) { 5529416c814035c65f26ae50c6555de8be84db9860dJean Chalard const int bigramFreq = bigramFreqIt->second; 55319ebd936462ee6e4796b8755be82d67437406845Jean Chalard return computeFrequencyForBigram(unigramFreq, bigramFreq); 55449ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard } else { 5559416c814035c65f26ae50c6555de8be84db9860dJean Chalard return backoff(unigramFreq); 55649ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard } 557171d1809ffc724de4fb793f481d592644e3d141eJean Chalard} 558171d1809ffc724de4fb793f481d592644e3d141eJean Chalard 5591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} // namespace latinime 5601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard 5611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard#endif // LATINIME_BINARY_FORMAT_H 562