1647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi/* 2647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * Copyright (C) 2013, The Android Open Source Project 3647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * 4647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * Licensed under the Apache License, Version 2.0 (the "License"); 5647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * you may not use this file except in compliance with the License. 6647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * You may obtain a copy of the License at 7647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * 8647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * http://www.apache.org/licenses/LICENSE-2.0 9647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * 10647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * Unless required by applicable law or agreed to in writing, software 11647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * distributed under the License is distributed on an "AS IS" BASIS, 12647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * See the License for the specific language governing permissions and 14647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi * limitations under the License. 15647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi */ 16647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 1788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" 18647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 19647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi#include "defines.h" 2088bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_bigrams_structure_policy.h" 2188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" 2288bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/byte_array_utils.h" 23647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 24647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynaginamespace latinime { 25647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 26647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagitypedef PatriciaTrieReadingUtils PtReadingUtils; 27647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 2827b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; 2927b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; 3027b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; 3127b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; 3227b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; 33647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 34647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for single/multiple char group 35647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; 3627b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi// Flag for terminal PtNodes 37647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; 38647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for shortcut targets presence 39647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; 40647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for bigram presence 41647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; 42647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi// Flag for non-words (typically, shortcut only entries) 43647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagiconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; 4405172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu// Flag for possibly offensive words 4505172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicuconst PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01; 46647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 479ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( 489ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi const uint8_t *const buffer, int *const pos) { 499ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 509ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi if (firstByte < 0x80) { 519ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return firstByte; 529ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } else { 539ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition( 549ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi buffer, pos); 559ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } 569ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi} 579ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi 589ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition( 599ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi const uint8_t *const buffer, int *const pos) { 609ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 619ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi} 629ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi 639ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, 64fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto const int *const codePointTable, int *const pos) { 65fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos); 669ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi} 679ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi 689ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi// Returns the number of read characters. 699ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, 70fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto const NodeFlags flags, const int maxLength, const int *const codePointTable, 71fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto int *const outBuffer, int *const pos) { 729ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi int length = 0; 739ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi if (hasMultipleChars(flags)) { 74fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable, 75fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto outBuffer, pos); 769ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } else { 77fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos); 789155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi if (codePoint == NOT_A_CODE_POINT) { 799155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is 809155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR 819155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi // when the PtNode has a single code point. 829155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi length = 0; 839155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", 849155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi *pos - 1, codePoint, buffer[*pos - 1]); 859155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi ASSERT(false); 869155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi } else if (maxLength > 0) { 879155eec0d9a6749879b413a22f30ede2e170ce19Keisuke Kuroyanagi outBuffer[0] = codePoint; 889ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi length = 1; 899ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } 909ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } 919ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return length; 929ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi} 939ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi 949ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi// Returns the number of skipped characters. 959ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, 96fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto const int maxLength, const int *const codePointTable, int *const pos) { 979ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi if (hasMultipleChars(flags)) { 989ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); 999ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } else { 1009ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi if (maxLength > 0) { 101fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto getCodePointAndAdvancePosition(buffer, codePointTable, pos); 1029ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return 1; 1039ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } else { 1049ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return 0; 1059ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } 1069ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi } 1079ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi} 1089ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi 1099ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer, 1109ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi int *const pos) { 1119ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 1129ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi} 1139ea9c61c99b2fc8ff9a5bbd02c0ad81a828b930cKeisuke Kuroyanagi 114647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition( 115647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi const uint8_t *const buffer, const NodeFlags flags, int *const pos) { 116647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi const int base = *pos; 117647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi int offset = 0; 11827b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi switch (MASK_CHILDREN_POSITION_TYPE & flags) { 11927b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: 120647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); 121647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi break; 12227b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: 123647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); 124647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi break; 12527b12933cd4e6dcb7363f0f33f3da8d7481bf7caKeisuke Kuroyanagi case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: 126647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); 127647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi break; 128647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi default: 129647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi // If we come here, it means we asked for the children of a word with 130647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi // no children. 131647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi return NOT_A_DICT_POS; 132647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi } 133647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi return base + offset; 134647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi} 135647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi 1361e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, 1371e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi const DictionaryShortcutsStructurePolicy *const shortcutPolicy, 138fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable, 1391e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, 1401e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, 1411e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi int *const outBigramPos, int *const outSiblingPos) { 1421e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi int readingPos = ptNodePos; 1431e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); 1441e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outFlags = flags; 1451e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outCodePointCount = getCharsAndAdvancePosition( 146fb2bde5a688d93aa946e3dd923aa1e99588777fcAkifumi Yoshimoto dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos); 1471e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outProbability = isTerminal(flags) ? 1481e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; 1491e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outChildrenPos = hasChildrenInFlags(flags) ? 1501e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS; 1511e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outShortcutPos = NOT_A_DICT_POS; 1521e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi if (hasShortcutTargets(flags)) { 1531e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outShortcutPos = readingPos; 1541e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi shortcutPolicy->skipAllShortcuts(&readingPos); 1551e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi } 1561e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outBigramPos = NOT_A_DICT_POS; 1571e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi if (hasBigrams(flags)) { 1581e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outBigramPos = readingPos; 1591e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi bigramPolicy->skipAllBigrams(&readingPos); 1601e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi } 1611e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi *outSiblingPos = readingPos; 1621e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi} 1631e2752924d921a9a2a26bf4e72e6db8d4e21982cKeisuke Kuroyanagi 164647c00070712067fc5ae415f9106be5ca4e17464Keisuke Kuroynagi} // namespace latinime 165