FusionDictionary.java revision 91cbe3566d21e1ae11c1409ae62c0c0a9614dec6
1bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/* 2bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Copyright (C) 2011 The Android Open Source Project 3bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License"); 58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License. 68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at 7bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * http://www.apache.org/licenses/LICENSE-2.0 9bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 10bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Unless required by applicable law or agreed to in writing, software 118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS, 128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and 148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License. 15bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 16bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 17cdc51fc6afc7fd374c5c9eeb0539cae5cf1de724Tom Ouyangpackage com.android.inputmethod.latin.makedict; 18bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 19b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasaimport com.android.inputmethod.annotations.UsedForTesting; 20eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanadaimport com.android.inputmethod.latin.Constants; 21eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 22bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.ArrayList; 23bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Arrays; 24bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Collections; 2547cac57e4593f47e753410e4199e84e458d6de6fJean Chalardimport java.util.Date; 26c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalardimport java.util.HashMap; 27bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Iterator; 28bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.LinkedList; 29bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 30bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/** 31bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A dictionary that can fusion heads and tails of words for more compression. 32bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 33b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasa@UsedForTesting 34a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaokapublic final class FusionDictionary implements Iterable<Word> { 3547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard private static final boolean DBG = MakedictLog.DBG; 3647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard 37bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 38bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A node of the dictionary, containing several CharGroups. 39bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 40bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A node is but an ordered array of CharGroups, which essentially contain all the 41bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * real information. 42bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This class also contains fields to cache size and address, to help with binary 43bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * generation. 44bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 45a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class Node { 46bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard ArrayList<CharGroup> mData; 47bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // To help with binary generation 48e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedSize = Integer.MIN_VALUE; 4991cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 5091cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // always hold the same value except between dictionary address compression, during which 5191cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the update process needs to know about both values at the same time. Updating will 5291cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // update the AfterUpdate value, and the code will move them to BeforeUpdate before 5391cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the next update pass. 5491cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 5591cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 56e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedParentAddress = 0; 57e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada 58bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Node() { 59bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mData = new ArrayList<CharGroup>(); 60bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 61bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Node(ArrayList<CharGroup> data) { 62bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mData = data; 63bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 64bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 65bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 66bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 67bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A string with a frequency. 68bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 69bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This represents an "attribute", that is either a bigram or a shortcut. 70bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 71a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class WeightedString { 7254e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public final String mWord; 7354e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public int mFrequency; 74bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public WeightedString(String word, int frequency) { 75bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mWord = word; 76bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 77bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 789f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 799f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 809f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public int hashCode() { 819f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return Arrays.hashCode(new Object[] { mWord, mFrequency }); 829f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 839f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 849f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 859f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public boolean equals(Object o) { 869f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (o == this) return true; 879f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (!(o instanceof WeightedString)) return false; 889f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa WeightedString w = (WeightedString)o; 899f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return mWord.equals(w.mWord) && mFrequency == w.mFrequency; 909f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 91bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 92bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 93bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 94eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * A group of characters, with a frequency, shortcut targets, bigrams, and children. 95bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 96bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This is the central class of the in-memory representation. A CharGroup is what can 97bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * be seen as a traditional "trie node", except it can hold several characters at the 98bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * same time. A CharGroup essentially represents one or several characters in the middle 99bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * of the trie trie; as such, it can be a terminal, and it can have children. 100bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * In this in-memory representation, whether the CharGroup is a terminal or not is represented 101bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 102bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 103bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 104bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 105a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class CharGroup { 106bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static final int NOT_A_TERMINAL = -1; 107bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int mChars[]; 1087cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mShortcutTargets; 1097cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mBigrams; 1107cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 111bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Node mChildren; 11272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsNotAWord; // Only a shortcut 11372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsBlacklistEntry; 114bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The two following members to help with binary generation 115bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int mCachedSize; 116bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int mCachedAddress; 117bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 118eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 11972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 12072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 121bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 122bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 123eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 124bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 125bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = null; 12672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 12772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 128bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 129bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 130eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 13172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 13272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) { 133bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 134bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 135eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 136bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 137bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = children; 13872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 13972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 140bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 141bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 142bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void addChild(CharGroup n) { 143bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null == mChildren) { 144bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = new Node(); 145bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 146bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren.mData.add(n); 147bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 148bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 149bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean isTerminal() { 150bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return NOT_A_TERMINAL != mFrequency; 151bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 152bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 153b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard public int getFrequency() { 154b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard return mFrequency; 155b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard } 156b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard 157a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsNotAWord() { 158a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsNotAWord; 159a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 160a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 161a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsBlacklistEntry() { 162a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsBlacklistEntry; 163a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 164a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 165a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getShortcutTargets() { 166a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 167a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mShortcutTargets) return null; 168f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfShortcutTargets = 169f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard new ArrayList<WeightedString>(mShortcutTargets); 170a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfShortcutTargets; 171a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 172a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 173a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getBigrams() { 174a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 175a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mBigrams) return null; 176f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); 177a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfBigrams; 178a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 179a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 180bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasSeveralChars() { 181bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(mChars.length > 0); 182bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return 1 < mChars.length; 183bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 1847cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 1857cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 1867cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Adds a word to the bigram list. Updates the frequency if the word already 1877cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * exists. 1887cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 1897cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void addBigram(final String word, final int frequency) { 1907cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 1917cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = new ArrayList<WeightedString>(); 1927cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 1937cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = getBigram(word); 1947cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram != null) { 1957cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram.mFrequency = frequency; 1967cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 1977cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram = new WeightedString(word, frequency); 1987cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 1997cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2007cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2017cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2027cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2037cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the shortcut target for the given word. Returns null if the word is not in the 2047cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * shortcut list. 2057cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2067cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getShortcut(final String word) { 20747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2087cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets != null) { 2097cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mShortcutTargets.size(); 2107cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString shortcut = mShortcutTargets.get(i); 2127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcut.mWord.equals(word)) { 2137cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return shortcut; 2147cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2157cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2167cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2177cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2187cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2197cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2207cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2217cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the bigram for the given word. 2227cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Returns null if the word is not in the bigrams list. 2237cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2247cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getBigram(final String word) { 22547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2267cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams != null) { 2277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mBigrams.size(); 2287cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2297cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = mBigrams.get(i); 2307cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram.mWord.equals(word)) { 2317cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return bigram; 2327cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2337cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2347cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2357cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2387cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2397cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to 2407cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 2417cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * updated if they are higher than the existing ones. 2427cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 24372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets, 24472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, 24572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 2467cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (frequency > mFrequency) { 2477cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mFrequency = frequency; 2487cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2497cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcutTargets != null) { 2507cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets == null) { 2517cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets = shortcutTargets; 2527cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2537cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = shortcutTargets.size(); 2547cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2557cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString shortcut = shortcutTargets.get(i); 2567cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingShortcut = getShortcut(shortcut.mWord); 2577cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingShortcut == null) { 2587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets.add(shortcut); 2597cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingShortcut.mFrequency < shortcut.mFrequency) { 2607cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingShortcut.mFrequency = shortcut.mFrequency; 2617cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2627cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2637cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2647cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2657cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigrams != null) { 2667cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 2677cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = bigrams; 2687cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2697cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = bigrams.size(); 2707cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2717cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString bigram = bigrams.get(i); 2727cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingBigram = getBigram(bigram.mWord); 2737cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingBigram == null) { 2747cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2757cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingBigram.mFrequency < bigram.mFrequency) { 2767cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingBigram.mFrequency = bigram.mFrequency; 2777cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2787cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2797cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2807cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 28172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 28272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 2837cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 284bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 285bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 286bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 287bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Options global to the dictionary. 288bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 289a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryOptions { 290f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mGermanUmlautProcessing; 291f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mFrenchLigatureProcessing; 292f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final HashMap<String, String> mAttributes; 293f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public DictionaryOptions(final HashMap<String, String> attributes, 294f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { 295c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mAttributes = attributes; 296f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mGermanUmlautProcessing = germanUmlautProcessing; 297f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mFrenchLigatureProcessing = frenchLigatureProcessing; 298c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 29947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard @Override 30047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard public String toString() { // Convenience method 30151a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard return toString(0, false); 30247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 30351a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard public String toString(final int indentCount, final boolean plumbing) { 30447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder indent = new StringBuilder(); 30551a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if (plumbing) { 30651a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append("H:"); 30751a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } else { 30851a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard for (int i = 0; i < indentCount; ++i) { 30951a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append(" "); 31051a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } 31147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 31247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder s = new StringBuilder(); 31347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard for (final String optionKey : mAttributes.keySet()) { 31447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 31547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(optionKey); 31647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(" = "); 31751a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if ("date".equals(optionKey) && !plumbing) { 31847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard // Date needs a number of milliseconds, but the dictionary contains seconds 31947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(new Date( 32047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); 32147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } else { 32247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(mAttributes.get(optionKey)); 32347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 32447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("\n"); 32547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 32647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mGermanUmlautProcessing) { 32747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 32847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs German umlaut processing\n"); 32947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 33047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mFrenchLigatureProcessing) { 33147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 33247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs French ligature processing\n"); 33347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 33447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard return s.toString(); 33547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 336bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 337bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 338bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public final DictionaryOptions mOptions; 339bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public final Node mRoot; 340bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 341bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public FusionDictionary(final Node root, final DictionaryOptions options) { 342bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mRoot = root; 343bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mOptions = options; 344bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 345bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 346c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard public void addOptionAttribute(final String key, final String value) { 347c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mOptions.mAttributes.put(key, value); 348c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 349c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard 350bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 351bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to convert a String to an int array. 352bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 3533c6d9fe14840fd2c455ec65b6481ed78f99a5460Yuichiro Hanada static int[] getCodePoints(final String word) { 35447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray, 35547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // which is not visible from the makedict package. Factor this code. 35647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final char[] characters = word.toCharArray(); 35747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int length = characters.length; 35847db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 35947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int codePoint = Character.codePointAt(characters, 0); 36047db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int dsti = 0; 36147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard for (int srci = Character.charCount(codePoint); 36247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard srci < length; srci += Character.charCount(codePoint), ++dsti) { 36347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 36447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoint = Character.codePointAt(characters, srci); 365c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 36647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 36747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard return codePoints; 368c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 369c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard 370c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard /** 371bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to add a word as a string. 372bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 373bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method adds a word to the dictionary with the given frequency. Optional 374bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * lists of bigrams and shortcuts can be passed here. For each word inside, 375bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * they will be added to the dictionary as necessary. 376bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 377bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word to add. 378bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 379eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 38072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 381bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 382eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public void add(final String word, final int frequency, 38372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 38472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, 38572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 38672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard } 38772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard 38872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard /** 38972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * Helper method to add a blacklist entry as a string. 39072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * 39172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param word the word to add as a blacklist entry. 39272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 39372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 39472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard */ 39572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void addBlacklistEntry(final String word, 39672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 39772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); 398bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 399bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 400bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 401bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Sanity check for a node. 402bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 403bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method checks that all CharGroups in a node are ordered as expected. 404bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * If they are, nothing happens. If they aren't, an exception is thrown. 405bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 406bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private void checkStack(Node node) { 407bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard ArrayList<CharGroup> stack = node.mData; 408bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int lastValue = -1; 409bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 0; i < stack.size(); ++i) { 410bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int currentValue = stack.get(i).mChars[0]; 411bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentValue <= lastValue) 412bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new RuntimeException("Invalid stack"); 413bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard else 414bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard lastValue = currentValue; 415bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 416bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 417bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 418bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 4197cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Helper method to add a new bigram to the dictionary. 4207cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * 4217cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word1 the previous word of the context 4227cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word2 the next word of the context 4237cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param frequency the bigram frequency 4247cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 4257cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void setBigram(final String word1, final String word2, final int frequency) { 4267cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang CharGroup charGroup = findWordInTree(mRoot, word1); 4277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (charGroup != null) { 4287cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final CharGroup charGroup2 = findWordInTree(mRoot, word2); 4297cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (charGroup2 == null) { 43072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word2), 0, null, false /* isNotAWord */, 43172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 432ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // The chargroup for the first word may have moved by the above insertion, 433ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // if word1 and word2 share a common stem that happens not to have been 434ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // a cutting point until now. In this case, we need to refresh charGroup. 435ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard charGroup = findWordInTree(mRoot, word1); 4367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang charGroup.addBigram(word2, frequency); 4387cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 4397cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang throw new RuntimeException("First word of bigram not found"); 4407cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4417cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4427cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 4437cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 444bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Add a word to this dictionary. 445bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 44644c64f46a143623dd793facd889c8d6eab5e230cJean Chalard * The shortcuts, if any, have to be in the dictionary already. If they aren't, 447bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * an exception is thrown. 448bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 449bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word, as an int array. 450bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 451eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 45272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 45372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 454bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 455eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard private void add(final int[] word, final int frequency, 45672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, 45772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 458bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(frequency >= 0 && frequency <= 255); 459eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada if (word.length >= Constants.Dictionary.MAX_WORD_LENGTH) { 460eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 461eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada return; 462eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada } 463eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 464bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Node currentNode = mRoot; 465bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int charIndex = 0; 466bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 467bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup currentGroup = null; 468bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int differentCharIndex = 0; // Set by the loop to the index of the char that differs 469bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int nodeIndex = findIndexOfChar(mRoot, word[charIndex]); 470bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard while (CHARACTER_NOT_FOUND != nodeIndex) { 471bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup = currentNode.mData.get(nodeIndex); 472cad25fc8a754d6f145bc846f17f270220b15c055Jean Chalard differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex); 473bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (ARRAYS_ARE_EQUAL != differentCharIndex 474bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard && differentCharIndex < currentGroup.mChars.length) break; 475bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null == currentGroup.mChildren) break; 476bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard charIndex += currentGroup.mChars.length; 477bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex >= word.length) break; 478bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentNode = currentGroup.mChildren; 479bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard nodeIndex = findIndexOfChar(currentNode, word[charIndex]); 480bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 481bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 482bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (-1 == nodeIndex) { 483bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // No node at this point to accept the word. Create one. 484bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); 485bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newGroup = new CharGroup( 486eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard Arrays.copyOfRange(word, charIndex, word.length), 48772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); 488bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentNode.mData.add(insertionIndex, newGroup); 48947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard if (DBG) checkStack(currentNode); 490bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 491bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // There is a word with a common prefix. 492bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (differentCharIndex == currentGroup.mChars.length) { 493bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 494bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word is a prefix of an existing word, but the node on which it 49545239029ceb876462e0d3f654c6b24ac9a9ed8afKen Wakasa // should end already exists as is. Since the old CharNode was not a terminal, 4967cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // make it one by filling in its frequency and other attributes 49772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.update(frequency, shortcutTargets, null, isNotAWord, 49872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 499bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 500bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word matches the full old word and extends past it. 501bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We only have to create a new node and add it to the end of this. 502bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newNode = new CharGroup( 503bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 50472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, 50572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 506bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup.mChildren = new Node(); 507bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup.mChildren.mData.add(newNode); 508bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 509bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 510bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (0 == differentCharIndex) { 5117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // Exact same word. Update the frequency if higher. This will also add the 51244c64f46a143623dd793facd889c8d6eab5e230cJean Chalard // new shortcuts to the existing shortcut list if it already exists. 51372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.update(frequency, shortcutTargets, null, 51472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsNotAWord && isNotAWord, 51572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsBlacklistEntry || isBlacklistEntry); 516bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 517bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Partial prefix match only. We have to replace the current node with a node 518bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // containing the current prefix and create two new ones for the tails. 519bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Node newChildren = new Node(); 520bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newOldWord = new CharGroup( 521bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, 522eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard currentGroup.mChars.length), currentGroup.mShortcutTargets, 52372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mBigrams, currentGroup.mFrequency, 52472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry, 52572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mChildren); 526bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(newOldWord); 527bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 528bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newParent; 529bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 530bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newParent = new CharGroup( 531bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 53272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 53372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry, newChildren); 534bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 535bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newParent = new CharGroup( 536bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 53745239029ceb876462e0d3f654c6b24ac9a9ed8afKen Wakasa null /* shortcutTargets */, null /* bigrams */, -1, 53872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); 53944c64f46a143623dd793facd889c8d6eab5e230cJean Chalard final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word, 54044c64f46a143623dd793facd889c8d6eab5e230cJean Chalard charIndex + differentCharIndex, word.length), 54172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 54272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry); 543bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int addIndex = word[charIndex + differentCharIndex] 544bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard > currentGroup.mChars[differentCharIndex] ? 1 : 0; 545bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(addIndex, newWord); 546bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 547bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentNode.mData.set(nodeIndex, newParent); 548bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 54947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard if (DBG) checkStack(currentNode); 550bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 551bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 552bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 553bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 55493ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka private static int ARRAYS_ARE_EQUAL = 0; 55593ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka 556bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 557bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Custom comparison of two int arrays taken to contain character codes. 558bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 559bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method compares the two arrays passed as an argument in a lexicographic way, 560bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * with an offset in the dst string. 561bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method does NOT test for the first character. It is taken to be equal. 562bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 563bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 564bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * strings are equal. This works BECAUSE we don't look at the first character. 565bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 566bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param src the left-hand side string of the comparison. 567bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dst the right-hand side string of the comparison. 568bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dstOffset the offset in the right-hand side string. 569bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 570bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 571bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private static int compareArrays(final int[] src, final int[] dst, int dstOffset) { 572bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We do NOT test the first char, because we come from a method that already 573bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tested it. 574bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 1; i < src.length; ++i) { 575bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dstOffset + i >= dst.length) return i; 576bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (src[i] != dst[dstOffset + i]) return i; 577bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 578bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dst.length > src.length) return src.length; 579bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return ARRAYS_ARE_EQUAL; 580bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 581bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 582bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 583bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper class that compares and sorts two chargroups according to their 584bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * first element only. I repeat: ONLY the first element is considered, the rest 585bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * is ignored. 586bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This comparator imposes orderings that are inconsistent with equals. 587bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 588a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka static private final class CharGroupComparator implements java.util.Comparator<CharGroup> { 58993ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka @Override 5902aa02b84a4fcfaf5554c278d2b25cf9414eecf8bKen Wakasa public int compare(CharGroup c1, CharGroup c2) { 591bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (c1.mChars[0] == c2.mChars[0]) return 0; 592bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return c1.mChars[0] < c2.mChars[0] ? -1 : 1; 593bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 594bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 595bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator(); 596bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 597bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 598bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Finds the insertion index of a character within a node. 599bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 600bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private static int findInsertionIndex(final Node node, int character) { 6012aa02b84a4fcfaf5554c278d2b25cf9414eecf8bKen Wakasa final ArrayList<CharGroup> data = node.mData; 60244c64f46a143623dd793facd889c8d6eab5e230cJean Chalard final CharGroup reference = new CharGroup(new int[] { character }, 60372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, 60472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 605bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); 606bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return result >= 0 ? result : -result - 1; 607bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 608bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 60993ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka private static int CHARACTER_NOT_FOUND = -1; 61093ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka 611bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 612bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Find the index of a char in a node, if it exists. 613bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 614bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param node the node to search in. 615bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param character the character to search for. 616bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else. 617bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 618bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private static int findIndexOfChar(final Node node, int character) { 619bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int insertionIndex = findInsertionIndex(node, character); 620bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND; 621bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex 622bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard : CHARACTER_NOT_FOUND; 623bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 624bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 625bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 626bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to find a word in a given branch. 627bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 6282da886651874b2588f18f800417ba858ac93d88bJean Chalard @SuppressWarnings("unused") 629a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard public static CharGroup findWordInTree(Node node, final String string) { 630bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int index = 0; 63112efad3d15147f255f6e01600c40e9fdb1224d84Jean Chalard final StringBuilder checker = DBG ? new StringBuilder() : null; 632a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard final int[] codePoints = getCodePoints(string); 633bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 634bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup currentGroup; 635bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 636a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard int indexOfGroup = findIndexOfChar(node, codePoints[index]); 637bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (CHARACTER_NOT_FOUND == indexOfGroup) return null; 638bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup = node.mData.get(indexOfGroup); 63966f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 640a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (codePoints.length - index < currentGroup.mChars.length) return null; 64166f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada int newIndex = index; 642a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard while (newIndex < codePoints.length && newIndex - index < currentGroup.mChars.length) { 643a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (currentGroup.mChars[newIndex - index] != codePoints[newIndex]) return null; 64466f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada newIndex++; 64566f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada } 64666f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada index = newIndex; 64766f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 64812efad3d15147f255f6e01600c40e9fdb1224d84Jean Chalard if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length)); 649a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) { 650bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard node = currentGroup.mChildren; 651bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 652a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard } while (null != node && index < codePoints.length); 653bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 654a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) return null; 6550d35c159fefd7591c2ab9d5037c32d1804024197Yuichiro Hanada if (!currentGroup.isTerminal()) return null; 656ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard if (DBG && !string.equals(checker.toString())) return null; 657bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return currentGroup; 658bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 659bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 660bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 661eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard * Helper method to find out whether a word is in the dict or not. 662eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard */ 663eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard public boolean hasWord(final String s) { 664eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard if (null == s || "".equals(s)) { 665eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard throw new RuntimeException("Can't search for a null or empty string"); 666eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 667eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard return null != findWordInTree(mRoot, s); 668eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 669eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard 670eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard /** 671bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Recursively count the number of character groups in a given branch of the trie. 672bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 673bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param node the parent node. 674bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the number of char groups in all the branch under this node. 675bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 676bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static int countCharGroups(final Node node) { 677bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int nodeSize = node.mData.size(); 678bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = nodeSize; 679bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = nodeSize - 1; i >= 0; --i) { 680bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup group = node.mData.get(i); 681bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null != group.mChildren) 682bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard size += countCharGroups(group.mChildren); 683bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 684bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 685bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 686bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 687bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 688bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Recursively count the number of nodes in a given branch of the trie. 689bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 690bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param node the node to count. 69120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return the number of nodes in this branch. 692bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 693bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static int countNodes(final Node node) { 694bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = 1; 695bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = node.mData.size() - 1; i >= 0; --i) { 696bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup group = node.mData.get(i); 697bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null != group.mChildren) 698bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard size += countNodes(group.mChildren); 699bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 700bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 701bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 702bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 70320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // Recursively find out whether there are any bigrams. 70420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // This can be pretty expensive especially if there aren't any (we return as soon 70520a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // as we find one, so it's much cheaper if there are bigrams) 70620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard private static boolean hasBigramsInternal(final Node node) { 70720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard if (null == node) return false; 70820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard for (int i = node.mData.size() - 1; i >= 0; --i) { 70920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard CharGroup group = node.mData.get(i); 71020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard if (null != group.mBigrams) return true; 71120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard if (hasBigramsInternal(group.mChildren)) return true; 71220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 71320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard return false; 71420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 71520a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 71620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard /** 71720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * Finds out whether there are any bigrams in this dictionary. 71820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * 71920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return true if there is any bigram, false otherwise. 72020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard */ 72120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // TODO: this is expensive especially for large dictionaries without any bigram. 72220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // The up side is, this is always accurate and correct and uses no memory. We should 72320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // find a more efficient way of doing this, without compromising too much on memory 72420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // and ease of use. 72520a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard public boolean hasBigrams() { 72620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard return hasBigramsInternal(mRoot); 72720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 72820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 729bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Historically, the tails of the words were going to be merged to save space. 730bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // However, that would prevent the code to search for a specific address in log(n) 731bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // time so this was abandoned. 732bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The code is still of interest as it does add some compression to any dictionary 733bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // that has no need for attributes. Implementations that does not read attributes should be 734bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // able to read a dictionary with merged tails. 735bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Also, the following code does support frequencies, as in, it will only merges 736bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that share the same frequency. Though it would result in the above loss of 737bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // performance while searching by address, it is still technically possible to merge 738bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that contain attributes, but this code does not take that into account - it does 739bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // not compare attributes and will merge terminals with different attributes regardless. 740bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void mergeTails() { 741bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard MakedictLog.i("Do not merge tails"); 742bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return; 743bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 744bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root)); 745bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of groups : " + countCharGroups(root)); 746bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 747bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final HashMap<String, ArrayList<Node>> repository = 748bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// new HashMap<String, ArrayList<Node>>(); 749bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// mergeTailsInner(repository, root); 750bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 751bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of different pseudohashes : " + repository.size()); 752bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// int size = 0; 753bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (ArrayList<Node> a : repository.values()) { 754bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// size += a.size(); 755bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 756bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of nodes after merge : " + (1 + size)); 757bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 758bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 759bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 760bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The following methods are used by the deactivated mergeTails() 761bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// private static boolean isEqual(Node a, Node b) { 762bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a && null == b) return true; 763bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a || null == b) return false; 764bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (a.data.size() != b.data.size()) return false; 765bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int size = a.data.size(); 766bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = size - 1; i >= 0; --i) { 767bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// CharGroup aGroup = a.data.get(i); 768bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// CharGroup bGroup = b.data.get(i); 769bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (aGroup.frequency != bGroup.frequency) return false; 770bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (aGroup.alternates == null && bGroup.alternates != null) return false; 771bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false; 772bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false; 773bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!isEqual(aGroup.children, bGroup.children)) return false; 774bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 775bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return true; 776bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 777bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 778bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// static private HashMap<String, ArrayList<Node>> mergeTailsInner( 779bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final HashMap<String, ArrayList<Node>> map, final Node node) { 780bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final ArrayList<CharGroup> branches = node.data; 781bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int nodeSize = branches.size(); 782bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = 0; i < nodeSize; ++i) { 783bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// CharGroup group = branches.get(i); 784bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null != group.children) { 785bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// String pseudoHash = getPseudoHash(group.children); 786bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// ArrayList<Node> similarList = map.get(pseudoHash); 787bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == similarList) { 788bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// similarList = new ArrayList<Node>(); 789bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// map.put(pseudoHash, similarList); 790bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 791bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// boolean merged = false; 792bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (Node similar : similarList) { 793bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (isEqual(group.children, similar)) { 794bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// group.children = similar; 795bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// merged = true; 796bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// break; 797bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 798bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 799bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!merged) { 800bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// similarList.add(group.children); 801bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 802bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// mergeTailsInner(map, group.children); 803bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 804bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 805bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return map; 806bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 807bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 808bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// private static String getPseudoHash(final Node node) { 809bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// StringBuilder s = new StringBuilder(); 810bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (CharGroup g : node.data) { 811bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// s.append(g.frequency); 812f2789819bd005b5b0581e8439601b5501306327dKen Wakasa// for (int ch : g.chars) { 813bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// s.append(Character.toChars(ch)); 814bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 815bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 816bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return s.toString(); 817bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 818bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 819bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 820bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Iterator to walk through a dictionary. 821bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 822bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This is purely for convenience. 823bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 824a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryIterator implements Iterator<Word> { 825a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka private static final class Position { 826bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Iterator<CharGroup> pos; 827bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public int length; 828bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Position(ArrayList<CharGroup> groups) { 829bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard pos = groups.iterator(); 830bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard length = 0; 831bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 832bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 833bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final StringBuilder mCurrentString; 834bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final LinkedList<Position> mPositions; 835bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 836bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public DictionaryIterator(ArrayList<CharGroup> root) { 837bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString = new StringBuilder(); 838bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions = new LinkedList<Position>(); 839bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final Position rootPos = new Position(root); 840bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.add(rootPos); 841bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 842bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 843bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 844bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasNext() { 845bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (Position p : mPositions) { 846bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (p.pos.hasNext()) { 847bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return true; 848bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 849bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 850bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return false; 851bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 852bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 853bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 854bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Word next() { 855bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Position currentPos = mPositions.getLast(); 856a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(currentPos.length); 857bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 858bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 859bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentPos.pos.hasNext()) { 860bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup currentGroup = currentPos.pos.next(); 861a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard currentPos.length = mCurrentString.length(); 862ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard for (int i : currentGroup.mChars) { 863bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString.append(Character.toChars(i)); 864ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 865bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null != currentGroup.mChildren) { 866bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentPos = new Position(currentGroup.mChildren.mData); 867ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard currentPos.length = mCurrentString.length(); 868bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.addLast(currentPos); 869bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 870ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard if (currentGroup.mFrequency >= 0) { 871bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return new Word(mCurrentString.toString(), currentGroup.mFrequency, 87272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mShortcutTargets, currentGroup.mBigrams, 87372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry); 874ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 875bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 876bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.removeLast(); 877bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentPos = mPositions.getLast(); 878a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(mPositions.getLast().length); 879bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 880f2789819bd005b5b0581e8439601b5501306327dKen Wakasa } while (true); 881bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 882bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 883bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 884bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void remove() { 885bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new UnsupportedOperationException("Unsupported yet"); 886bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 887bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 888bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 889bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 890bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 891bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Method to return an iterator. 892bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 893bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 894bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * and say : for (Word w : x) {} 895bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 896bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 897bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Iterator<Word> iterator() { 898bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return new DictionaryIterator(mRoot.mData); 899bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 900bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard} 901