/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1630088259480130e5bac5c2028e2c7c3e6d4c51a2satok
1730088259480130e5bac5c2028e2c7c3e6d4c51a2satok#ifndef LATINIME_BIGRAM_DICTIONARY_H
1830088259480130e5bac5c2028e2c7c3e6d4c51a2satok#define LATINIME_BIGRAM_DICTIONARY_H
1930088259480130e5bac5c2028e2c7c3e6d4c51a2satok
201ff8dc47be1734555af1c0c011ea6cf72b395a43Jean Chalard#include <map>
219c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard#include <stdint.h>
229c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard
23f1634c872c57a5e8d0a861cda299fdbd98740e79Jean Chalard#include "defines.h"
24f1634c872c57a5e8d0a861cda299fdbd98740e79Jean Chalard
namespace latinime {

// Only pointers to Dictionary are used in this header, so a forward
// declaration is sufficient.
class Dictionary;

// Interface for bigram queries against a dictionary supplied as a raw byte
// buffer. All out-of-line members are defined outside this header; notes on
// their behavior below are therefore hedged where this file does not show it.
class BigramDictionary {
 public:
    // dict:             byte buffer holding the dictionary data (not modified
    //                   through this pointer).
    // maxWordLength:    upper bound on word length.
    // parentDictionary: the Dictionary this object is associated with.
    // NOTE(review): ownership of dict / parentDictionary is not visible here;
    // presumably both are borrowed -- confirm against the definition.
    BigramDictionary(const unsigned char *dict, int maxWordLength, Dictionary *parentDictionary);

    // Looks up bigrams for the given word (code points in word[0..length)).
    // codes/codesSize describe the input codes; results go to the caller
    // supplied outWords and frequencies buffers.
    // NOTE(review): presumably returns the number of bigrams written and
    // expects outWords to hold maxBigrams * maxWordLength entries -- confirm.
    int getBigrams(const int32_t *word, int length, int *codes, int codesSize,
            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams);

    // Returns an int identifying the bigram list for prevWord.
    // NOTE(review): presumably an offset into the dictionary buffer, with a
    // sentinel when the word has no bigrams -- confirm against the definition.
    int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength);

    // Populates *map and the filter byte array for the bigrams of prevWord.
    // NOTE(review): judging by the name, map is keyed by bigram address with
    // frequency as the value; the required filter size is not visible here.
    void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
            std::map<int, int> *map, uint8_t *filter);

    // Reports whether (word1, word2) is a valid bigram; each word is passed as
    // a code point array together with its length.
    bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2);

    ~BigramDictionary();

 private:
    bool addWordBigram(unsigned short *word, int length, int frequency);
    int getBigramAddress(int *pos, bool advance);
    int getBigramFreq(int *pos);
    void searchForTerminalNode(int addressLookingFor, int frequency);

    // Bit tests on the dictionary byte at index *pos. They only read DICT, so
    // they are const-qualified and usable from const and non-const members.
    // No bounds checking is done on *pos.
    bool getFirstBitOfByte(int *pos) const { return (DICT[*pos] & 0x80) != 0; }   // top bit (0x80)
    bool getSecondBitOfByte(int *pos) const { return (DICT[*pos] & 0x40) != 0; }  // next bit (0x40)

    bool checkFirstCharacter(unsigned short *word);

    // Dictionary bytes and the word length limit.
    // NOTE(review): presumably set from the constructor arguments -- confirm.
    const unsigned char *DICT;
    const int MAX_WORD_LENGTH;
    // TODO: Re-implement proximity correction for bigram correction
    static const int MAX_ALTERNATIVES = 1;

    Dictionary *mParentDictionary;
    // NOTE(review): the members below look like per-query state (output
    // buffers and input codes of a getBigrams() call) -- confirm in the .cpp.
    int *mBigramFreq;
    int mMaxBigrams;
    unsigned short *mBigramChars;
    int *mInputCodes;
    int mInputLength;
};

} // namespace latinime
61ce9e52a12a6af8fca0eba42aaae24602fbd5c998Ken Wakasa
6230088259480130e5bac5c2028e2c7c3e6d4c51a2satok#endif // LATINIME_BIGRAM_DICTIONARY_H
63