11059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard/*
21059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * Copyright (C) 2011 The Android Open Source Project
31059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *
41059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * Licensed under the Apache License, Version 2.0 (the "License");
51059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * you may not use this file except in compliance with the License.
61059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * You may obtain a copy of the License at
71059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *
81059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *      http://www.apache.org/licenses/LICENSE-2.0
91059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard *
101059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * Unless required by applicable law or agreed to in writing, software
111059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * distributed under the License is distributed on an "AS IS" BASIS,
121059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
131059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * See the License for the specific language governing permissions and
141059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard * limitations under the License.
151059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard */
161059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
171059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard#ifndef LATINIME_BINARY_FORMAT_H
181059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard#define LATINIME_BINARY_FORMAT_H
191059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard#include <limits>
2149ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard#include "bloom_filter.h"
22f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard#include "unigram_dictionary.h"
23f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard
241059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardnamespace latinime {
251059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
261059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardclass BinaryFormat {
27e12e9b5b69e6242af61ee690a81bedde1bdd4936Ken Wakasa private:
281059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
291059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
301059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
311059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
32e12e9b5b69e6242af61ee690a81bedde1bdd4936Ken Wakasa public:
33f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard    const static int UNKNOWN_FORMAT = -1;
3446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    // Originally, format version 1 had a 16-bit magic number, then the version number `01'
3546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    // then options that must be 0. Hence the first 32-bits of the format are always as follow
3646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    // and it's okay to consider them a magic number as a whole.
3746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
3846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
3946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    // The versions of Latin IME that only handle format version 1 only test for the magic
4046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    // number, so we had to change it so that version 2 files would be rejected by older
4146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    // implementations. On this occasion, we made the magic number 32 bits long.
4246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
43f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard
449a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
459a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    const static int SHORTCUT_LIST_SIZE_SIZE = 2;
469a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard
47f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard    static int detectFormat(const uint8_t* const dict);
4846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    static unsigned int getHeaderSize(const uint8_t* const dict);
49e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    static unsigned int getFlags(const uint8_t* const dict);
501059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
511059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
521059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
531059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
541059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int skipOtherCharacters(const uint8_t* const dict, const int pos);
551059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int skipChildrenPosition(const uint8_t flags, const int pos);
561059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int skipFrequency(const uint8_t flags, const int pos);
579a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
589a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            const int pos);
621059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
631059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static bool hasChildrenInFlags(const uint8_t flags);
641059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
651059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            int *pos);
66522a04ea5b249d0af556647d2abcad57e5b99b4fJean Chalard    static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
676a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            const int length);
68588e2f296451a8eb074af9140d018b828105237fJean Chalard    static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
69e308459531a4dd64ee80aa76e351725180ad856eJean Chalard            uint16_t* outWord, int* outUnigramFrequency);
7019ebd936462ee6e4796b8755be82d67437406845Jean Chalard    static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
7149ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard    static int getProbability(const int position, const std::map<int, int> *bigramMap,
7249ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard            const uint8_t *bigramFilter, const int unigramFreq);
73e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard
74e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    // Flags for special processing
75e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
76e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    // something very bad (like, the apocalypse) will happen. Please update both at the same time.
77e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    enum {
78e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard        REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
79e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard        REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
80e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    };
81e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    const static unsigned int NO_FLAGS = 0;
821059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard};
831059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
84f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalardinline int BinaryFormat::detectFormat(const uint8_t* const dict) {
8546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    // The magic number is stored big-endian.
8646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
8746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    switch (magicNumber) {
8846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    case FORMAT_VERSION_1_MAGIC_NUMBER:
8946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Format 1 header is exactly 5 bytes long and looks like:
9046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Magic number (2 bytes) 0x78 0xB1
9146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Version number (1 byte) 0x01
9246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Options (2 bytes) must be 0x00 0x00
9346a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        return 1;
9446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    case FORMAT_VERSION_2_MAGIC_NUMBER:
9546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Format 2 header is as follows:
9646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
9746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Version number (2 bytes) 0x00 0x02
985b0761e6a94227d6ef788f589fb6edcd44ed791fJean Chalard        // Options (2 bytes)
9946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // Header size (4 bytes) : integer, big endian
10046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        return (dict[4] << 8) + dict[5];
10146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    default:
10246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        return UNKNOWN_FORMAT;
10346a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    }
10446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard}
10546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard
106e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalardinline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) {
107e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    switch (detectFormat(dict)) {
108e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    case 1:
109e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard        return NO_FLAGS;
110e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    default:
111e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard        return (dict[6] << 8) + dict[7];
112e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard    }
113e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard}
114e81ac8baa0dc0e8d671c813b93100070c23b9a1dJean Chalard
11546a1eec4d86f4b47434275065d3170728255f2c8Jean Chalardinline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
11646a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    switch (detectFormat(dict)) {
11746a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    case 1:
11846a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        return FORMAT_VERSION_1_HEADER_SIZE;
11946a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    case 2:
12046a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        // See the format of the header in the comment in detectFormat() above
12146a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];
12246a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    default:
12346a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard        return std::numeric_limits<unsigned int>::max();
12446a1eec4d86f4b47434275065d3170728255f2c8Jean Chalard    }
125f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard}
126f0a980966264fa98ef8e1b834650d9bf54de92aeJean Chalard
1271059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
1284c0eca6e416485be61d7fddcad1e1552444daf85Jean Chalard    const int msb = dict[(*pos)++];
1294c0eca6e416485be61d7fddcad1e1552444daf85Jean Chalard    if (msb < 0x80) return msb;
1304c0eca6e416485be61d7fddcad1e1552444daf85Jean Chalard    return ((msb & 0x7F) << 8) | dict[(*pos)++];
1311059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
1321059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
1331059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
1341059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return dict[(*pos)++];
1351059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
1361059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
1371059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
1381059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    const int origin = *pos;
1391059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    const int32_t character = dict[origin];
1401059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
1411059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        if (character == CHARACTER_ARRAY_TERMINATOR) {
1421059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            *pos = origin + 1;
1431059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            return NOT_A_CHARACTER;
1441059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        } else {
1451059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            *pos = origin + 3;
1461059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            const int32_t char_1 = character << 16;
1471059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
1481059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            return char_2 + dict[origin + 2];
1491059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        }
1501059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    } else {
1511059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        *pos = origin + 1;
1521059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        return character;
1531059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    }
1541059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
1551059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
1561059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
1571059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        const int pos) {
1581059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return dict[pos];
1591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
1601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
1611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
1621059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    int currentPos = pos;
1631059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    int32_t character = dict[currentPos++];
1641059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    while (CHARACTER_ARRAY_TERMINATOR != character) {
1651059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
1661059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
1671059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        }
1681059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        character = dict[currentPos++];
1691059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    }
1701059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return currentPos;
1711059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
1721059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
1731059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardstatic inline int attributeAddressSize(const uint8_t flags) {
1741059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
1751059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
1761059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    /* Note: this is a value-dependant optimization of what may probably be
1771059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard       more readably written this way:
1781059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard       switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
1791059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard       case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
1801059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard       case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
1811059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard       case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
1821059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard       default: return 0;
1831059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard       }
1841059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    */
1851059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
1861059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
1879a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardstatic inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
1881059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    int currentPos = pos;
1899a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
1901059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
1911059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        currentPos += attributeAddressSize(flags);
1929a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard        flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
1931059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    }
1941059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    currentPos += attributeAddressSize(flags);
1951059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return currentPos;
1961059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
1971059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
1981059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardstatic inline int childrenAddressSize(const uint8_t flags) {
1991059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    static const int CHILDREN_ADDRESS_SHIFT = 6;
2001059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
2011059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    /* See the note in attributeAddressSize. The same applies here */
2021059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
2031059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2049a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardstatic inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
2059a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
2069a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard}
2079a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard
2081059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
2091059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return pos + childrenAddressSize(flags);
2101059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
2111059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2121059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
2131059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
2141059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
2151059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2169a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardinline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
2171059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        const int pos) {
218e0e339699addbdc837b79c110a8432f0641d16eeJean Chalard    if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
2199a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard        return pos + shortcutByteSize(dict, pos);
2209a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    } else {
2219a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard        return pos;
222e0e339699addbdc837b79c110a8432f0641d16eeJean Chalard    }
2239a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard}
2249a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard
2259a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardinline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
2269a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard        const int pos) {
2271059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
2289a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard        return skipExistingBigrams(dict, pos);
2299a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    } else {
2309a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard        return pos;
2311059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    }
2329a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard}
2339a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard
2349a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalardinline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
2359a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard        const int pos) {
2369a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    // This function skips all attributes: shortcuts and bigrams.
2379a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    int newPos = pos;
2389a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    newPos = skipShortcuts(dict, flags, newPos);
2399a933a742d2a3ffdfb955705ad086035bc27db60Jean Chalard    newPos = skipBigrams(dict, flags, newPos);
240e0e339699addbdc837b79c110a8432f0641d16eeJean Chalard    return newPos;
2411059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
2421059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2431059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
2441059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        const uint8_t flags, const int pos) {
2451059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    int currentPos = pos;
2461059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    currentPos = skipChildrenPosition(flags, currentPos);
2471059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    currentPos = skipAllAttributes(dict, flags, currentPos);
2481059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return currentPos;
2491059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
2501059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2511059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
2521059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        const int pos) {
2531059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    int offset = 0;
2541059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
2551059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
2561059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset = dict[pos];
2571059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            break;
2581059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
2591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset = dict[pos] << 8;
2601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset += dict[pos + 1];
2611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            break;
2621059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
2631059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset = dict[pos] << 16;
2641059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset += dict[pos + 1] << 8;
2651059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset += dict[pos + 2];
2661059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            break;
2671059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        default:
2681059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            // If we come here, it means we asked for the children of a word with
2691059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            // no children.
2701059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            return -1;
2711059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    }
2721059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return pos + offset;
2731059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
2741059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2751059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
2761059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
2771059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
2781059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
2791059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
2801059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalardinline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
2811059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        const uint8_t flags, int *pos) {
2821059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    int offset = 0;
2831059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    const int origin = *pos;
2841059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
2851059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
2861059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset = dict[origin];
2871059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            *pos = origin + 1;
2881059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            break;
2891059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
2901059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset = dict[origin] << 8;
2911059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset += dict[origin + 1];
2921059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            *pos = origin + 2;
2931059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            break;
2941059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
2951059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset = dict[origin] << 16;
2961059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset += dict[origin + 1] << 8;
2971059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            offset += dict[origin + 2];
2981059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            *pos = origin + 3;
2991059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard            break;
3001059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    }
3011059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
3021059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        return origin - offset;
3031059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    } else {
3041059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard        return origin + offset;
3051059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard    }
3061059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard}
3071059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
3086a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard// This function gets the byte position of the last chargroup of the exact matching word in the
3096a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard// dictionary. If no match is found, it returns NOT_VALID_WORD.
3106a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalardinline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
311522a04ea5b249d0af556647d2abcad57e5b99b4fJean Chalard        const int32_t* const inWord, const int length) {
3126a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard    int pos = 0;
3136a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard    int wordPos = 0;
3146a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard
3156a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard    while (true) {
3166a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard        // If we already traversed the tree further than the word is long, there means
3176a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard        // there was no match (or we would have found it).
3186a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard        if (wordPos > length) return NOT_VALID_WORD;
3196a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
320522a04ea5b249d0af556647d2abcad57e5b99b4fJean Chalard        const int32_t wChar = inWord[wordPos];
3216a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard        while (true) {
3226a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            // If there are no more character groups in this node, it means we could not
3236a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            // find a matching character for this depth, therefore there is no match.
3246a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            if (0 >= charGroupCount) return NOT_VALID_WORD;
3256a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            const int charGroupPos = pos;
3266a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
3276a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
3286a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            if (character == wChar) {
3296a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // This is the correct node. Only one character group may start with the same
3306a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // char within a node, so either we found our match in this node, or there is
3316a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // no match and we can return NOT_VALID_WORD. So we will check all the characters
3326a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // in this character group indeed does match.
3336a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
3346a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
3356a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    while (NOT_A_CHARACTER != character) {
3366a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        ++wordPos;
3376a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        // If we shoot the length of the word we search for, or if we find a single
3386a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        // character that does not match, as explained above, it means the word is
3396a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        // not in the dictionary (by virtue of this chargroup being the only one to
3406a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        // match the word on the first character, but not matching the whole word).
3416a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        if (wordPos > length) return NOT_VALID_WORD;
3426a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
3436a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
3446a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    }
3456a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                }
3466a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // If we come here we know that so far, we do match. Either we are on a terminal
3476a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // and we match the length, in which case we found it, or we traverse children.
3486a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // If we don't match the length AND don't have children, then a word in the
3496a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // dictionary fully matches a prefix of the searched word but not the full word.
3506a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                ++wordPos;
3516a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
3526a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    if (wordPos == length) {
3536a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        return charGroupPos;
3546a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    }
3556a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
3566a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                }
3576a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
3586a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
3596a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    return NOT_VALID_WORD;
3606a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                }
3616a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // We have children and we are still shorter than the word we are searching for, so
3626a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // we need to traverse children. Put the pointer on the children position, and
3636a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // break
3646a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
3656a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                break;
3666a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            } else {
3676a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                // This chargroup does not match, so skip the remaining part and go to the next.
3686a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
3696a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                    pos = BinaryFormat::skipOtherCharacters(root, pos);
3706a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                }
3716a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                pos = BinaryFormat::skipFrequency(flags, pos);
3726a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
3736a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            }
3746a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard            --charGroupCount;
3756a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard        }
3766a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard    }
3776a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard}
3786a0e9642a8d1046e3b730c6dd1a633a4ec0f656fJean Chalard
379588e2f296451a8eb074af9140d018b828105237fJean Chalard// This function searches for a terminal in the dictionary by its address.
380588e2f296451a8eb074af9140d018b828105237fJean Chalard// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
381588e2f296451a8eb074af9140d018b828105237fJean Chalard// it is possible to check for this with advantageous complexity. For each node, we search
382588e2f296451a8eb074af9140d018b828105237fJean Chalard// for groups with children and compare the children address with the address we look for.
383588e2f296451a8eb074af9140d018b828105237fJean Chalard// When we shoot the address we look for, it means the word we look for is in the children
384588e2f296451a8eb074af9140d018b828105237fJean Chalard// of the previous group. The only tricky part is the fact that if we arrive at the end of a
385588e2f296451a8eb074af9140d018b828105237fJean Chalard// node with the last group's children address still less than what we are searching for, we
386588e2f296451a8eb074af9140d018b828105237fJean Chalard// must descend the last group's children (for example, if the word we are searching for starts
387588e2f296451a8eb074af9140d018b828105237fJean Chalard// with a z, it's the last group of the root node, so all children addresses will be smaller
388588e2f296451a8eb074af9140d018b828105237fJean Chalard// than the address we look for, and we have to descend the z node).
389588e2f296451a8eb074af9140d018b828105237fJean Chalard/* Parameters :
390588e2f296451a8eb074af9140d018b828105237fJean Chalard * root: the dictionary buffer
391588e2f296451a8eb074af9140d018b828105237fJean Chalard * address: the byte position of the last chargroup of the word we are searching for (this is
392588e2f296451a8eb074af9140d018b828105237fJean Chalard *   what is stored as the "bigram address" in each bigram)
393588e2f296451a8eb074af9140d018b828105237fJean Chalard * outword: an array to write the found word, with MAX_WORD_LENGTH size.
394e308459531a4dd64ee80aa76e351725180ad856eJean Chalard * outUnigramFrequency: a pointer to an int to write the frequency into.
395588e2f296451a8eb074af9140d018b828105237fJean Chalard * Return value : the length of the word, of 0 if the word was not found.
396588e2f296451a8eb074af9140d018b828105237fJean Chalard */
397588e2f296451a8eb074af9140d018b828105237fJean Chalardinline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
398e308459531a4dd64ee80aa76e351725180ad856eJean Chalard        const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) {
399588e2f296451a8eb074af9140d018b828105237fJean Chalard    int pos = 0;
400588e2f296451a8eb074af9140d018b828105237fJean Chalard    int wordPos = 0;
401588e2f296451a8eb074af9140d018b828105237fJean Chalard
402588e2f296451a8eb074af9140d018b828105237fJean Chalard    // One iteration of the outer loop iterates through nodes. As stated above, we will only
403588e2f296451a8eb074af9140d018b828105237fJean Chalard    // traverse nodes that are actually a part of the terminal we are searching, so each time
404588e2f296451a8eb074af9140d018b828105237fJean Chalard    // we enter this loop we are one depth level further than last time.
405588e2f296451a8eb074af9140d018b828105237fJean Chalard    // The only reason we count nodes is because we want to reduce the probability of infinite
406588e2f296451a8eb074af9140d018b828105237fJean Chalard    // looping in case there is a bug. Since we know there is an upper bound to the depth we are
407588e2f296451a8eb074af9140d018b828105237fJean Chalard    // supposed to traverse, it does not hurt to count iterations.
408588e2f296451a8eb074af9140d018b828105237fJean Chalard    for (int loopCount = maxDepth; loopCount > 0; --loopCount) {
409588e2f296451a8eb074af9140d018b828105237fJean Chalard        int lastCandidateGroupPos = 0;
410588e2f296451a8eb074af9140d018b828105237fJean Chalard        // Let's loop through char groups in this node searching for either the terminal
411588e2f296451a8eb074af9140d018b828105237fJean Chalard        // or one of its ascendants.
412588e2f296451a8eb074af9140d018b828105237fJean Chalard        for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0;
413588e2f296451a8eb074af9140d018b828105237fJean Chalard                 --charGroupCount) {
414588e2f296451a8eb074af9140d018b828105237fJean Chalard            const int startPos = pos;
415588e2f296451a8eb074af9140d018b828105237fJean Chalard            const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
416588e2f296451a8eb074af9140d018b828105237fJean Chalard            const int32_t character = getCharCodeAndForwardPointer(root, &pos);
417588e2f296451a8eb074af9140d018b828105237fJean Chalard            if (address == startPos) {
418588e2f296451a8eb074af9140d018b828105237fJean Chalard                // We found the address. Copy the rest of the word in the buffer and return
419588e2f296451a8eb074af9140d018b828105237fJean Chalard                // the length.
420588e2f296451a8eb074af9140d018b828105237fJean Chalard                outWord[wordPos] = character;
421588e2f296451a8eb074af9140d018b828105237fJean Chalard                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
422588e2f296451a8eb074af9140d018b828105237fJean Chalard                    int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
423588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // We count chars in order to avoid infinite loops if the file is broken or
424588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // if there is some other bug
425588e2f296451a8eb074af9140d018b828105237fJean Chalard                    int charCount = maxDepth;
426402b0570505c7ea1389e1c153e5db0300568ce26Jean Chalard                    while (NOT_A_CHARACTER != nextChar && --charCount > 0) {
427588e2f296451a8eb074af9140d018b828105237fJean Chalard                        outWord[++wordPos] = nextChar;
428588e2f296451a8eb074af9140d018b828105237fJean Chalard                        nextChar = getCharCodeAndForwardPointer(root, &pos);
429588e2f296451a8eb074af9140d018b828105237fJean Chalard                    }
430588e2f296451a8eb074af9140d018b828105237fJean Chalard                }
431e308459531a4dd64ee80aa76e351725180ad856eJean Chalard                *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
432588e2f296451a8eb074af9140d018b828105237fJean Chalard                return ++wordPos;
433588e2f296451a8eb074af9140d018b828105237fJean Chalard            }
434588e2f296451a8eb074af9140d018b828105237fJean Chalard            // We need to skip past this char group, so skip any remaining chars after the
435588e2f296451a8eb074af9140d018b828105237fJean Chalard            // first and possibly the frequency.
436588e2f296451a8eb074af9140d018b828105237fJean Chalard            if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
437588e2f296451a8eb074af9140d018b828105237fJean Chalard                pos = skipOtherCharacters(root, pos);
438588e2f296451a8eb074af9140d018b828105237fJean Chalard            }
439588e2f296451a8eb074af9140d018b828105237fJean Chalard            pos = skipFrequency(flags, pos);
440588e2f296451a8eb074af9140d018b828105237fJean Chalard
441588e2f296451a8eb074af9140d018b828105237fJean Chalard            // The fact that this group has children is very important. Since we already know
442588e2f296451a8eb074af9140d018b828105237fJean Chalard            // that this group does not match, if it has no children we know it is irrelevant
443588e2f296451a8eb074af9140d018b828105237fJean Chalard            // to what we are searching for.
444588e2f296451a8eb074af9140d018b828105237fJean Chalard            const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
445588e2f296451a8eb074af9140d018b828105237fJean Chalard                    (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
446588e2f296451a8eb074af9140d018b828105237fJean Chalard            // We will write in `found' whether we have passed the children address we are
447588e2f296451a8eb074af9140d018b828105237fJean Chalard            // searching for. For example if we search for "beer", the children of b are less
448588e2f296451a8eb074af9140d018b828105237fJean Chalard            // than the address we are searching for and the children of c are greater. When we
449588e2f296451a8eb074af9140d018b828105237fJean Chalard            // come here for c, we realize this is too big, and that we should descend b.
450588e2f296451a8eb074af9140d018b828105237fJean Chalard            bool found;
451588e2f296451a8eb074af9140d018b828105237fJean Chalard            if (hasChildren) {
452588e2f296451a8eb074af9140d018b828105237fJean Chalard                // Here comes the tricky part. First, read the children position.
453588e2f296451a8eb074af9140d018b828105237fJean Chalard                const int childrenPos = readChildrenPosition(root, flags, pos);
454588e2f296451a8eb074af9140d018b828105237fJean Chalard                if (childrenPos > address) {
455588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // If the children pos is greater than address, it means the previous chargroup,
456588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // which address is stored in lastCandidateGroupPos, was the right one.
457588e2f296451a8eb074af9140d018b828105237fJean Chalard                    found = true;
458588e2f296451a8eb074af9140d018b828105237fJean Chalard                } else if (1 >= charGroupCount) {
459588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // However if we are on the LAST group of this node, and we have NOT shot the
460588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // address we should descend THIS node. So we trick the lastCandidateGroupPos
461588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // so that we will descend this node, not the previous one.
462588e2f296451a8eb074af9140d018b828105237fJean Chalard                    lastCandidateGroupPos = startPos;
463588e2f296451a8eb074af9140d018b828105237fJean Chalard                    found = true;
464588e2f296451a8eb074af9140d018b828105237fJean Chalard                } else {
465588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // Else, we should continue looking.
466588e2f296451a8eb074af9140d018b828105237fJean Chalard                    found = false;
467588e2f296451a8eb074af9140d018b828105237fJean Chalard                }
468588e2f296451a8eb074af9140d018b828105237fJean Chalard            } else {
469588e2f296451a8eb074af9140d018b828105237fJean Chalard                // Even if we don't have children here, we could still be on the last group of this
470588e2f296451a8eb074af9140d018b828105237fJean Chalard                // node. If this is the case, we should descend the last group that had children,
471588e2f296451a8eb074af9140d018b828105237fJean Chalard                // and their address is already in lastCandidateGroup.
472588e2f296451a8eb074af9140d018b828105237fJean Chalard                found = (1 >= charGroupCount);
473588e2f296451a8eb074af9140d018b828105237fJean Chalard            }
474588e2f296451a8eb074af9140d018b828105237fJean Chalard
475588e2f296451a8eb074af9140d018b828105237fJean Chalard            if (found) {
476588e2f296451a8eb074af9140d018b828105237fJean Chalard                // Okay, we found the group we should descend. Its address is in
477588e2f296451a8eb074af9140d018b828105237fJean Chalard                // the lastCandidateGroupPos variable, so we just re-read it.
478588e2f296451a8eb074af9140d018b828105237fJean Chalard                if (0 != lastCandidateGroupPos) {
479588e2f296451a8eb074af9140d018b828105237fJean Chalard                    const uint8_t lastFlags =
480588e2f296451a8eb074af9140d018b828105237fJean Chalard                            getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
481588e2f296451a8eb074af9140d018b828105237fJean Chalard                    const int32_t lastChar =
482588e2f296451a8eb074af9140d018b828105237fJean Chalard                            getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
483588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // We copy all the characters in this group to the buffer
484588e2f296451a8eb074af9140d018b828105237fJean Chalard                    outWord[wordPos] = lastChar;
485588e2f296451a8eb074af9140d018b828105237fJean Chalard                    if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
486588e2f296451a8eb074af9140d018b828105237fJean Chalard                        int32_t nextChar =
487588e2f296451a8eb074af9140d018b828105237fJean Chalard                                getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
488588e2f296451a8eb074af9140d018b828105237fJean Chalard                        int charCount = maxDepth;
489588e2f296451a8eb074af9140d018b828105237fJean Chalard                        while (-1 != nextChar && --charCount > 0) {
490588e2f296451a8eb074af9140d018b828105237fJean Chalard                            outWord[++wordPos] = nextChar;
491588e2f296451a8eb074af9140d018b828105237fJean Chalard                            nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
492588e2f296451a8eb074af9140d018b828105237fJean Chalard                        }
493588e2f296451a8eb074af9140d018b828105237fJean Chalard                    }
494588e2f296451a8eb074af9140d018b828105237fJean Chalard                    ++wordPos;
495588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // Now we only need to branch to the children address. Skip the frequency if
496588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // it's there, read pos, and break to resume the search at pos.
497588e2f296451a8eb074af9140d018b828105237fJean Chalard                    lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
498588e2f296451a8eb074af9140d018b828105237fJean Chalard                    pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
499588e2f296451a8eb074af9140d018b828105237fJean Chalard                    break;
500588e2f296451a8eb074af9140d018b828105237fJean Chalard                } else {
501588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // Here is a little tricky part: we come here if we found out that all children
502588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // addresses in this group are bigger than the address we are searching for.
503588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // Should we conclude the word is not in the dictionary? No! It could still be
504588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // one of the remaining chargroups in this node, so we have to keep looking in
505588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // this node until we find it (or we realize it's not there either, in which
506588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // case it's actually not in the dictionary). Pass the end of this group, ready
507588e2f296451a8eb074af9140d018b828105237fJean Chalard                    // to start the next one.
508588e2f296451a8eb074af9140d018b828105237fJean Chalard                    pos = skipChildrenPosAndAttributes(root, flags, pos);
509588e2f296451a8eb074af9140d018b828105237fJean Chalard                }
510588e2f296451a8eb074af9140d018b828105237fJean Chalard            } else {
511588e2f296451a8eb074af9140d018b828105237fJean Chalard                // If we did not find it, we should record the last children address for the next
512588e2f296451a8eb074af9140d018b828105237fJean Chalard                // iteration.
513588e2f296451a8eb074af9140d018b828105237fJean Chalard                if (hasChildren) lastCandidateGroupPos = startPos;
514588e2f296451a8eb074af9140d018b828105237fJean Chalard                // Now skip the end of this group (children pos and the attributes if any) so that
515588e2f296451a8eb074af9140d018b828105237fJean Chalard                // our pos is after the end of this char group, at the start of the next one.
516588e2f296451a8eb074af9140d018b828105237fJean Chalard                pos = skipChildrenPosAndAttributes(root, flags, pos);
517588e2f296451a8eb074af9140d018b828105237fJean Chalard            }
518588e2f296451a8eb074af9140d018b828105237fJean Chalard
519588e2f296451a8eb074af9140d018b828105237fJean Chalard        }
520588e2f296451a8eb074af9140d018b828105237fJean Chalard    }
521588e2f296451a8eb074af9140d018b828105237fJean Chalard    // If we have looked through all the chargroups and found no match, the address is
522588e2f296451a8eb074af9140d018b828105237fJean Chalard    // not the address of a terminal in this dictionary.
523588e2f296451a8eb074af9140d018b828105237fJean Chalard    return 0;
524588e2f296451a8eb074af9140d018b828105237fJean Chalard}
525588e2f296451a8eb074af9140d018b828105237fJean Chalard
5269416c814035c65f26ae50c6555de8be84db9860dJean Chalardstatic inline int backoff(const int unigramFreq) {
5279416c814035c65f26ae50c6555de8be84db9860dJean Chalard    return unigramFreq;
5289416c814035c65f26ae50c6555de8be84db9860dJean Chalard    // For some reason, applying the backoff weight gives bad results in tests. To apply the
5299416c814035c65f26ae50c6555de8be84db9860dJean Chalard    // backoff weight, we divide the probability by 2, which in our storing format means
5309416c814035c65f26ae50c6555de8be84db9860dJean Chalard    // decreasing the score by 8.
5319416c814035c65f26ae50c6555de8be84db9860dJean Chalard    // TODO: figure out what's wrong with this.
5329416c814035c65f26ae50c6555de8be84db9860dJean Chalard    // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
5339416c814035c65f26ae50c6555de8be84db9860dJean Chalard}
5349416c814035c65f26ae50c6555de8be84db9860dJean Chalard
53519ebd936462ee6e4796b8755be82d67437406845Jean Chalardinline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
53619ebd936462ee6e4796b8755be82d67437406845Jean Chalard    // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
53719ebd936462ee6e4796b8755be82d67437406845Jean Chalard    // unigram frequency to be the median value of the 17th step from the top. A value of
53819ebd936462ee6e4796b8755be82d67437406845Jean Chalard    // 0 for the bigram frequency represents the middle of the 16th step from the top,
53919ebd936462ee6e4796b8755be82d67437406845Jean Chalard    // while a value of 15 represents the middle of the top step.
54019ebd936462ee6e4796b8755be82d67437406845Jean Chalard    // See makedict.BinaryDictInputOutput for details.
54119ebd936462ee6e4796b8755be82d67437406845Jean Chalard    const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
542cb99376307f0d57e2935449f93fc162253dcdd01Jean Chalard    return (int)(unigramFreq + (bigramFreq + 1) * stepSize);
54319ebd936462ee6e4796b8755be82d67437406845Jean Chalard}
54419ebd936462ee6e4796b8755be82d67437406845Jean Chalard
5459416c814035c65f26ae50c6555de8be84db9860dJean Chalard// This returns a probability in log space.
54649ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalardinline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
5478950ce6c44706467bb386570ae236a2b8b983666Jean Chalard        const uint8_t *bigramFilter, const int unigramFreq) {
5489416c814035c65f26ae50c6555de8be84db9860dJean Chalard    if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
5499416c814035c65f26ae50c6555de8be84db9860dJean Chalard    if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
5509416c814035c65f26ae50c6555de8be84db9860dJean Chalard    const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
5519416c814035c65f26ae50c6555de8be84db9860dJean Chalard    if (bigramFreqIt != bigramMap->end()) {
5529416c814035c65f26ae50c6555de8be84db9860dJean Chalard        const int bigramFreq = bigramFreqIt->second;
55319ebd936462ee6e4796b8755be82d67437406845Jean Chalard        return computeFrequencyForBigram(unigramFreq, bigramFreq);
55449ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard    } else {
5559416c814035c65f26ae50c6555de8be84db9860dJean Chalard        return backoff(unigramFreq);
55649ba135fdedb3c6b33ec915e91ecad682b7655b8Jean Chalard    }
557171d1809ffc724de4fb793f481d592644e3d141eJean Chalard}
558171d1809ffc724de4fb793f481d592644e3d141eJean Chalard
5591059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard} // namespace latinime
5601059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard
5611059f273649ea9cf4dd3c9c3466ec6fed5496a54Jean Chalard#endif // LATINIME_BINARY_FORMAT_H
562