/*
 * Copyright (C) 2013, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
1788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
18647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
19647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi#include "defines.h"
2088bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
2188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
2288bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/byte_array_utils.h"
23647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
24647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynaginamespace latinime {
25647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
26647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagitypedef PatriciaTrieReadingUtils PtReadingUtils;
27647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
2827b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
2927b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
3027b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
3127b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
3227b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;
33647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
34647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for single/multiple char group
35647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
3627b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi// Flag for terminal PtNodes
37647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
38647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for shortcut targets presence
39647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
40647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for bigram presence
41647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
42647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for non-words (typically, shortcut only entries)
43647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
4405172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu// Flag for possibly offensive words
4505172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicuconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
46647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
479ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
489ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        const uint8_t *const buffer, int *const pos) {
499ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
509ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    if (firstByte < 0x80) {
519ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        return firstByte;
529ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    } else {
539ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
549ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi                buffer, pos);
559ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    }
569ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi}
579ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi
589ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
599ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        const uint8_t *const buffer, int *const pos) {
609ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
619ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi}
629ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi
639ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
64fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto        const int *const codePointTable, int *const pos) {
65fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto    return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
669ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi}
679ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi
689ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi// Returns the number of read characters.
699ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
70fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto        const NodeFlags flags, const int maxLength, const int *const codePointTable,
71fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto        int *const outBuffer, int *const pos) {
729ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    int length = 0;
739ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    if (hasMultipleChars(flags)) {
74fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto        length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
75fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto                outBuffer, pos);
769ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    } else {
77fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto        const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
789155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi        if (codePoint == NOT_A_CODE_POINT) {
799155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi            // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
809155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi            // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
819155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi            // when the PtNode has a single code point.
829155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi            length = 0;
839155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi            AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
849155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi                    *pos - 1, codePoint, buffer[*pos - 1]);
859155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi            ASSERT(false);
869155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi        } else if (maxLength > 0) {
879155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi            outBuffer[0] = codePoint;
889ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi            length = 1;
899ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        }
909ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    }
919ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    return length;
929ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi}
939ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi
949ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi// Returns the number of skipped characters.
959ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
96fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto        const int maxLength, const int *const codePointTable, int *const pos) {
979ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    if (hasMultipleChars(flags)) {
989ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
999ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    } else {
1009ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        if (maxLength > 0) {
101fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto            getCodePointAndAdvancePosition(buffer, codePointTable, pos);
1029ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi            return 1;
1039ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        } else {
1049ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi            return 0;
1059ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        }
1069ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    }
1079ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi}
1089ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi
1099ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
1109ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi        int *const pos) {
1119ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi    return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
1129ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi}
1139ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi
114647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
115647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi        const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
116647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi    const int base = *pos;
117647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi    int offset = 0;
11827b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi    switch (MASK_CHILDREN_POSITION_TYPE & flags) {
11927b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi        case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
120647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
121647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            break;
12227b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi        case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
123647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
124647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            break;
12527b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi        case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
126647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
127647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            break;
128647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi        default:
129647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            // If we come here, it means we asked for the children of a word with
130647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            // no children.
131647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi            return NOT_A_DICT_POS;
132647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi    }
133647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi    return base + offset;
134647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi}
135647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi
1361e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
1371e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
138fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto        const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
1391e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
1401e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
1411e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        int *const outBigramPos, int *const outSiblingPos) {
1421e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    int readingPos = ptNodePos;
1431e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
1441e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    *outFlags = flags;
1451e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    *outCodePointCount = getCharsAndAdvancePosition(
146fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto            dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
1471e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    *outProbability = isTerminal(flags) ?
1481e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi            readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
1491e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    *outChildrenPos = hasChildrenInFlags(flags) ?
1501e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi            readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
1511e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    *outShortcutPos = NOT_A_DICT_POS;
1521e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    if (hasShortcutTargets(flags)) {
1531e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        *outShortcutPos = readingPos;
1541e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        shortcutPolicy->skipAllShortcuts(&readingPos);
1551e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    }
1561e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    *outBigramPos = NOT_A_DICT_POS;
1571e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    if (hasBigrams(flags)) {
1581e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        *outBigramPos = readingPos;
1591e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi        bigramPolicy->skipAllBigrams(&readingPos);
1601e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    }
1611e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi    *outSiblingPos = readingPos;
1621e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi}
1631e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi
164647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi} // namespace latinime
165