130088259480130e5bac5c2028e2c7c3e6d4c51a2satok/* 230088259480130e5bac5c2028e2c7c3e6d4c51a2satok * Copyright (C) 2010 The Android Open Source Project 330088259480130e5bac5c2028e2c7c3e6d4c51a2satok * 430088259480130e5bac5c2028e2c7c3e6d4c51a2satok * Licensed under the Apache License, Version 2.0 (the "License"); 530088259480130e5bac5c2028e2c7c3e6d4c51a2satok * you may not use this file except in compliance with the License. 630088259480130e5bac5c2028e2c7c3e6d4c51a2satok * You may obtain a copy of the License at 730088259480130e5bac5c2028e2c7c3e6d4c51a2satok * 830088259480130e5bac5c2028e2c7c3e6d4c51a2satok * http://www.apache.org/licenses/LICENSE-2.0 930088259480130e5bac5c2028e2c7c3e6d4c51a2satok * 1030088259480130e5bac5c2028e2c7c3e6d4c51a2satok * Unless required by applicable law or agreed to in writing, software 1130088259480130e5bac5c2028e2c7c3e6d4c51a2satok * distributed under the License is distributed on an "AS IS" BASIS, 1230088259480130e5bac5c2028e2c7c3e6d4c51a2satok * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1330088259480130e5bac5c2028e2c7c3e6d4c51a2satok * See the License for the specific language governing permissions and 1430088259480130e5bac5c2028e2c7c3e6d4c51a2satok * limitations under the License. 1530088259480130e5bac5c2028e2c7c3e6d4c51a2satok */ 1630088259480130e5bac5c2028e2c7c3e6d4c51a2satok 1730088259480130e5bac5c2028e2c7c3e6d4c51a2satok#ifndef LATINIME_BIGRAM_DICTIONARY_H 1830088259480130e5bac5c2028e2c7c3e6d4c51a2satok#define LATINIME_BIGRAM_DICTIONARY_H 1930088259480130e5bac5c2028e2c7c3e6d4c51a2satok 201ff8dc47be1734555af1c0c011ea6cf72b395a43Jean Chalard#include <map> 219c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard#include <stdint.h> 229c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard 23f1634c872c57a5e8d0a861cda299fdbd98740e79Jean Chalard#include "defines.h" 24f1634c872c57a5e8d0a861cda299fdbd98740e79Jean Chalard 2530088259480130e5bac5c2028e2c7c3e6d4c51a2satoknamespace latinime { 2630088259480130e5bac5c2028e2c7c3e6d4c51a2satok 2730088259480130e5bac5c2028e2c7c3e6d4c51a2satokclass Dictionary; 2830088259480130e5bac5c2028e2c7c3e6d4c51a2satokclass BigramDictionary { 29e12e9b5b69e6242af61ee690a81bedde1bdd4936Ken Wakasa public: 305b0761e6a94227d6ef788f589fb6edcd44ed791fJean Chalard BigramDictionary(const unsigned char *dict, int maxWordLength, Dictionary *parentDictionary); 31522a04ea5b249d0af556647d2abcad57e5b99b4fJean Chalard int getBigrams(const int32_t *word, int length, int *codes, int codesSize, 326ba8de2a608dfe4865b0b59a753f2d2abbedeeffsatok unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams); 33351864b38a2a19a3b591efe3ed58a5998bb4c79dJean Chalard int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength); 34f1634c872c57a5e8d0a861cda299fdbd98740e79Jean Chalard void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, 35f1634c872c57a5e8d0a861cda299fdbd98740e79Jean Chalard std::map<int, int> *map, uint8_t *filter); 364d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2); 3730088259480130e5bac5c2028e2c7c3e6d4c51a2satok ~BigramDictionary(); 38e12e9b5b69e6242af61ee690a81bedde1bdd4936Ken Wakasa private: 3918c28f431eadc1b451ca25d14fd683db4b234838satok bool addWordBigram(unsigned short *word, int length, int frequency); 4018c28f431eadc1b451ca25d14fd683db4b234838satok int getBigramAddress(int *pos, bool advance); 4118c28f431eadc1b451ca25d14fd683db4b234838satok int getBigramFreq(int *pos); 4218c28f431eadc1b451ca25d14fd683db4b234838satok void searchForTerminalNode(int addressLookingFor, int frequency); 4318c28f431eadc1b451ca25d14fd683db4b234838satok bool getFirstBitOfByte(int *pos) { return (DICT[*pos] & 0x80) > 0; } 4418c28f431eadc1b451ca25d14fd683db4b234838satok bool getSecondBitOfByte(int *pos) { return (DICT[*pos] & 0x40) > 0; } 4518c28f431eadc1b451ca25d14fd683db4b234838satok bool checkFirstCharacter(unsigned short *word); 4618c28f431eadc1b451ca25d14fd683db4b234838satok 4718c28f431eadc1b451ca25d14fd683db4b234838satok const unsigned char *DICT; 4818c28f431eadc1b451ca25d14fd683db4b234838satok const int MAX_WORD_LENGTH; 496ba8de2a608dfe4865b0b59a753f2d2abbedeeffsatok // TODO: Re-implement proximity correction for bigram correction 506ba8de2a608dfe4865b0b59a753f2d2abbedeeffsatok static const int MAX_ALTERNATIVES = 1; 5118c28f431eadc1b451ca25d14fd683db4b234838satok 5218c28f431eadc1b451ca25d14fd683db4b234838satok Dictionary *mParentDictionary; 5318c28f431eadc1b451ca25d14fd683db4b234838satok int *mBigramFreq; 5418c28f431eadc1b451ca25d14fd683db4b234838satok int mMaxBigrams; 5518c28f431eadc1b451ca25d14fd683db4b234838satok unsigned short *mBigramChars; 5618c28f431eadc1b451ca25d14fd683db4b234838satok int *mInputCodes; 5718c28f431eadc1b451ca25d14fd683db4b234838satok int mInputLength; 5830088259480130e5bac5c2028e2c7c3e6d4c51a2satok}; 59ce9e52a12a6af8fca0eba42aaae24602fbd5c998Ken Wakasa 60ce9e52a12a6af8fca0eba42aaae24602fbd5c998Ken Wakasa} // namespace latinime 61ce9e52a12a6af8fca0eba42aaae24602fbd5c998Ken Wakasa 6230088259480130e5bac5c2028e2c7c3e6d4c51a2satok#endif // LATINIME_BIGRAM_DICTIONARY_H 63