FusionDictionary.java revision 93445b4821e9e8ecc7dd52f1a5d5316c7eec2654
1bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/* 2bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Copyright (C) 2011 The Android Open Source Project 3bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License"); 58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License. 68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at 7bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * http://www.apache.org/licenses/LICENSE-2.0 9bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 10bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Unless required by applicable law or agreed to in writing, software 118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS, 128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and 148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License. 15bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 16bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 17cdc51fc6afc7fd374c5c9eeb0539cae5cf1de724Tom Ouyangpackage com.android.inputmethod.latin.makedict; 18bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 19b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasaimport com.android.inputmethod.annotations.UsedForTesting; 20eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanadaimport com.android.inputmethod.latin.Constants; 21eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 22bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.ArrayList; 23bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Arrays; 24bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Collections; 2547cac57e4593f47e753410e4199e84e458d6de6fJean Chalardimport java.util.Date; 26c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalardimport java.util.HashMap; 27bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Iterator; 28bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.LinkedList; 29bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 30bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/** 31bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A dictionary that can fusion heads and tails of words for more compression. 32bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 33b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasa@UsedForTesting 34a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaokapublic final class FusionDictionary implements Iterable<Word> { 3547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard private static final boolean DBG = MakedictLog.DBG; 3647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard 3793445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard private static int CHARACTER_NOT_FOUND_INDEX = -1; 3893445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard 39bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 40bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A node of the dictionary, containing several CharGroups. 41bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 42bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A node is but an ordered array of CharGroups, which essentially contain all the 43bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * real information. 44bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This class also contains fields to cache size and address, to help with binary 45bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * generation. 46bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 47a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class Node { 48bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard ArrayList<CharGroup> mData; 49bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // To help with binary generation 50e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedSize = Integer.MIN_VALUE; 5191cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 5291cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // always hold the same value except between dictionary address compression, during which 5391cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the update process needs to know about both values at the same time. Updating will 5491cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // update the AfterUpdate value, and the code will move them to BeforeUpdate before 5591cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the next update pass. 5691cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 5791cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 58e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedParentAddress = 0; 59e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada 60bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Node() { 61bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mData = new ArrayList<CharGroup>(); 62bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 63bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Node(ArrayList<CharGroup> data) { 64bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mData = data; 65bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 66bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 67bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 68bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 69bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A string with a frequency. 70bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 71bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This represents an "attribute", that is either a bigram or a shortcut. 72bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 73a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class WeightedString { 7454e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public final String mWord; 7554e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public int mFrequency; 76bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public WeightedString(String word, int frequency) { 77bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mWord = word; 78bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 79bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 809f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 819f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 829f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public int hashCode() { 839f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return Arrays.hashCode(new Object[] { mWord, mFrequency }); 849f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 859f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 869f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 879f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public boolean equals(Object o) { 889f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (o == this) return true; 899f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (!(o instanceof WeightedString)) return false; 909f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa WeightedString w = (WeightedString)o; 919f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return mWord.equals(w.mWord) && mFrequency == w.mFrequency; 929f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 93bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 94bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 95bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 96eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * A group of characters, with a frequency, shortcut targets, bigrams, and children. 97bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 98bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This is the central class of the in-memory representation. A CharGroup is what can 99bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * be seen as a traditional "trie node", except it can hold several characters at the 100bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * same time. A CharGroup essentially represents one or several characters in the middle 101bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * of the trie trie; as such, it can be a terminal, and it can have children. 102bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * In this in-memory representation, whether the CharGroup is a terminal or not is represented 103bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 104bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 105bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 106bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 107a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class CharGroup { 108bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static final int NOT_A_TERMINAL = -1; 109bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int mChars[]; 1107cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mShortcutTargets; 1117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mBigrams; 1127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 113bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Node mChildren; 11472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsNotAWord; // Only a shortcut 11572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsBlacklistEntry; 11625de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary 11725de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // generation. Before and After always hold the same value except during dictionary 11825de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // address compression, where the update process needs to know about both values at the 11925de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // same time. Updating will update the AfterUpdate value, and the code will move them 12025de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // to BeforeUpdate before the next update pass. 12125de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // The update process does not need two versions of mCachedSize. 12225de86a6a26232964937872f203f065dd9f9faa4Jean Chalard int mCachedSize; // The size, in bytes, of this char group. 12325de86a6a26232964937872f203f065dd9f9faa4Jean Chalard int mCachedAddressBeforeUpdate; // The address of this char group (before update) 12425de86a6a26232964937872f203f065dd9f9faa4Jean Chalard int mCachedAddressAfterUpdate; // The address of this char group (after update) 125bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 126eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 12772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 12872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 129bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 130bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 131eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 132bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 133bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = null; 13472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 13572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 136bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 137bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 138eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 13972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 14072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) { 141bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 142bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 143eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 144bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 145bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = children; 14672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 14772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 148bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 149bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 150bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void addChild(CharGroup n) { 151bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null == mChildren) { 152bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = new Node(); 153bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 154bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren.mData.add(n); 155bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 156bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 157bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean isTerminal() { 158bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return NOT_A_TERMINAL != mFrequency; 159bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 160bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 161b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard public int getFrequency() { 162b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard return mFrequency; 163b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard } 164b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard 165a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsNotAWord() { 166a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsNotAWord; 167a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 168a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 169a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsBlacklistEntry() { 170a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsBlacklistEntry; 171a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 172a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 173a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getShortcutTargets() { 174a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 175a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mShortcutTargets) return null; 176f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfShortcutTargets = 177f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard new ArrayList<WeightedString>(mShortcutTargets); 178a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfShortcutTargets; 179a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 180a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 181a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getBigrams() { 182a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 183a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mBigrams) return null; 184f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); 185a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfBigrams; 186a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 187a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 188bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasSeveralChars() { 189bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(mChars.length > 0); 190bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return 1 < mChars.length; 191bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 1927cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 1937cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 1947cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Adds a word to the bigram list. Updates the frequency if the word already 1957cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * exists. 1967cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 1977cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void addBigram(final String word, final int frequency) { 1987cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 1997cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = new ArrayList<WeightedString>(); 2007cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2017cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = getBigram(word); 2027cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram != null) { 2037cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram.mFrequency = frequency; 2047cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2057cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram = new WeightedString(word, frequency); 2067cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2077cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2087cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2097cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2107cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the shortcut target for the given word. Returns null if the word is not in the 2127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * shortcut list. 2137cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2147cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getShortcut(final String word) { 21547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2167cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets != null) { 2177cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mShortcutTargets.size(); 2187cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2197cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString shortcut = mShortcutTargets.get(i); 2207cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcut.mWord.equals(word)) { 2217cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return shortcut; 2227cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2237cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2247cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2257cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2267cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2287cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2297cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the bigram for the given word. 2307cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Returns null if the word is not in the bigrams list. 2317cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2327cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getBigram(final String word) { 23347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2347cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams != null) { 2357cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mBigrams.size(); 2367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = mBigrams.get(i); 2387cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram.mWord.equals(word)) { 2397cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return bigram; 2407cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2417cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2427cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2437cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2447cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2457cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2467cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2477cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to 2487cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 2497cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * updated if they are higher than the existing ones. 2507cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 25172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets, 25272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, 25372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 2547cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (frequency > mFrequency) { 2557cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mFrequency = frequency; 2567cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2577cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcutTargets != null) { 2587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets == null) { 2597cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets = shortcutTargets; 2607cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2617cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = shortcutTargets.size(); 2627cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2637cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString shortcut = shortcutTargets.get(i); 2647cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingShortcut = getShortcut(shortcut.mWord); 2657cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingShortcut == null) { 2667cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets.add(shortcut); 2677cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingShortcut.mFrequency < shortcut.mFrequency) { 2687cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingShortcut.mFrequency = shortcut.mFrequency; 2697cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2707cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2717cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2727cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2737cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigrams != null) { 2747cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 2757cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = bigrams; 2767cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2777cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = bigrams.size(); 2787cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2797cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString bigram = bigrams.get(i); 2807cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingBigram = getBigram(bigram.mWord); 2817cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingBigram == null) { 2827cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2837cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingBigram.mFrequency < bigram.mFrequency) { 2847cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingBigram.mFrequency = bigram.mFrequency; 2857cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2867cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2877cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2887cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 28972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 29072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 2917cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 292bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 293bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 294bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 295bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Options global to the dictionary. 296bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 297a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryOptions { 298f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mGermanUmlautProcessing; 299f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mFrenchLigatureProcessing; 300f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final HashMap<String, String> mAttributes; 301f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public DictionaryOptions(final HashMap<String, String> attributes, 302f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { 303c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mAttributes = attributes; 304f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mGermanUmlautProcessing = germanUmlautProcessing; 305f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mFrenchLigatureProcessing = frenchLigatureProcessing; 306c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 30747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard @Override 30847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard public String toString() { // Convenience method 30951a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard return toString(0, false); 31047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 31151a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard public String toString(final int indentCount, final boolean plumbing) { 31247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder indent = new StringBuilder(); 31351a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if (plumbing) { 31451a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append("H:"); 31551a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } else { 31651a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard for (int i = 0; i < indentCount; ++i) { 31751a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append(" "); 31851a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } 31947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 32047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder s = new StringBuilder(); 32147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard for (final String optionKey : mAttributes.keySet()) { 32247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 32347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(optionKey); 32447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(" = "); 32551a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if ("date".equals(optionKey) && !plumbing) { 32647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard // Date needs a number of milliseconds, but the dictionary contains seconds 32747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(new Date( 32847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); 32947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } else { 33047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(mAttributes.get(optionKey)); 33147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 33247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("\n"); 33347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 33447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mGermanUmlautProcessing) { 33547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 33647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs German umlaut processing\n"); 33747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 33847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mFrenchLigatureProcessing) { 33947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 34047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs French ligature processing\n"); 34147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 34247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard return s.toString(); 34347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 344bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 345bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 346bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public final DictionaryOptions mOptions; 347bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public final Node mRoot; 348bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 349bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public FusionDictionary(final Node root, final DictionaryOptions options) { 350bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mRoot = root; 351bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mOptions = options; 352bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 353bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 354c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard public void addOptionAttribute(final String key, final String value) { 355c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mOptions.mAttributes.put(key, value); 356c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 357c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard 358bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 359bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to convert a String to an int array. 360bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 3613c6d9fe14840fd2c455ec65b6481ed78f99a5460Yuichiro Hanada static int[] getCodePoints(final String word) { 36247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray, 36347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // which is not visible from the makedict package. Factor this code. 36447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final char[] characters = word.toCharArray(); 36547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int length = characters.length; 36647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 36747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int codePoint = Character.codePointAt(characters, 0); 36847db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int dsti = 0; 36947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard for (int srci = Character.charCount(codePoint); 37047db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard srci < length; srci += Character.charCount(codePoint), ++dsti) { 37147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 37247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoint = Character.codePointAt(characters, srci); 373c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 37447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 37547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard return codePoints; 376c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 377c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard 378c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard /** 379bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to add a word as a string. 380bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 381bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method adds a word to the dictionary with the given frequency. Optional 382bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * lists of bigrams and shortcuts can be passed here. For each word inside, 383bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * they will be added to the dictionary as necessary. 384bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 385bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word to add. 386bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 387eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 38872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 389bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 390eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public void add(final String word, final int frequency, 39172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 39272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, 39372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 39472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard } 39572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard 39672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard /** 39772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * Helper method to add a blacklist entry as a string. 39872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * 39972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param word the word to add as a blacklist entry. 40072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 40172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 40272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard */ 40372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void addBlacklistEntry(final String word, 40472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 40572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); 406bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 407bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 408bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 409bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Sanity check for a node. 410bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 411bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method checks that all CharGroups in a node are ordered as expected. 412bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * If they are, nothing happens. If they aren't, an exception is thrown. 413bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 414bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private void checkStack(Node node) { 415bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard ArrayList<CharGroup> stack = node.mData; 416bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int lastValue = -1; 417bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 0; i < stack.size(); ++i) { 418bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int currentValue = stack.get(i).mChars[0]; 419bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentValue <= lastValue) 420bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new RuntimeException("Invalid stack"); 421bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard else 422bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard lastValue = currentValue; 423bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 424bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 425bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 426bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 4277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Helper method to add a new bigram to the dictionary. 4287cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * 4297cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word1 the previous word of the context 4307cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word2 the next word of the context 4317cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param frequency the bigram frequency 4327cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 4337cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void setBigram(final String word1, final String word2, final int frequency) { 4347cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang CharGroup charGroup = findWordInTree(mRoot, word1); 4357cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (charGroup != null) { 4367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final CharGroup charGroup2 = findWordInTree(mRoot, word2); 4377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (charGroup2 == null) { 43872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word2), 0, null, false /* isNotAWord */, 43972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 440ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // The chargroup for the first word may have moved by the above insertion, 441ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // if word1 and word2 share a common stem that happens not to have been 442ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // a cutting point until now. In this case, we need to refresh charGroup. 443ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard charGroup = findWordInTree(mRoot, word1); 4447cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4457cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang charGroup.addBigram(word2, frequency); 4467cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 4477cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang throw new RuntimeException("First word of bigram not found"); 4487cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4497cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4507cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 4517cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 452bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Add a word to this dictionary. 453bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 45444c64f46a143623dd793facd889c8d6eab5e230cJean Chalard * The shortcuts, if any, have to be in the dictionary already. If they aren't, 455bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * an exception is thrown. 456bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 457bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word, as an int array. 458bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 459eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 46072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 46172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 462bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 463eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard private void add(final int[] word, final int frequency, 46472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, 46572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 466bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(frequency >= 0 && frequency <= 255); 467ffcbbaf12788a9fc9398607a548e552d7d2bf05eSatoshi Kataoka if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { 468eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 469eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada return; 470eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada } 471eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 472bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Node currentNode = mRoot; 473bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int charIndex = 0; 474bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 475bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup currentGroup = null; 476bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int differentCharIndex = 0; // Set by the loop to the index of the char that differs 477bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int nodeIndex = findIndexOfChar(mRoot, word[charIndex]); 47893445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { 479bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup = currentNode.mData.get(nodeIndex); 480cad25fc8a754d6f145bc846f17f270220b15c055Jean Chalard differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex); 481bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (ARRAYS_ARE_EQUAL != differentCharIndex 482bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard && differentCharIndex < currentGroup.mChars.length) break; 483bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null == currentGroup.mChildren) break; 484bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard charIndex += currentGroup.mChars.length; 485bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex >= word.length) break; 486bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentNode = currentGroup.mChildren; 487bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard nodeIndex = findIndexOfChar(currentNode, word[charIndex]); 488bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 489bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 49093445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { 491bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // No node at this point to accept the word. Create one. 492bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); 493bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newGroup = new CharGroup( 494eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard Arrays.copyOfRange(word, charIndex, word.length), 49572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); 496bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentNode.mData.add(insertionIndex, newGroup); 49747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard if (DBG) checkStack(currentNode); 498bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 499bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // There is a word with a common prefix. 500bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (differentCharIndex == currentGroup.mChars.length) { 501bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 502bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word is a prefix of an existing word, but the node on which it 50345239029ceb876462e0d3f654c6b24ac9a9ed8afKen Wakasa // should end already exists as is. Since the old CharNode was not a terminal, 5047cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // make it one by filling in its frequency and other attributes 50572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.update(frequency, shortcutTargets, null, isNotAWord, 50672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 507bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 508bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word matches the full old word and extends past it. 509bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We only have to create a new node and add it to the end of this. 510bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newNode = new CharGroup( 511bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 51272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, 51372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 514bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup.mChildren = new Node(); 515bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup.mChildren.mData.add(newNode); 516bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 517bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 518bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (0 == differentCharIndex) { 5197cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // Exact same word. Update the frequency if higher. This will also add the 52044c64f46a143623dd793facd889c8d6eab5e230cJean Chalard // new shortcuts to the existing shortcut list if it already exists. 52172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.update(frequency, shortcutTargets, null, 52272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsNotAWord && isNotAWord, 52372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsBlacklistEntry || isBlacklistEntry); 524bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 525bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Partial prefix match only. We have to replace the current node with a node 526bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // containing the current prefix and create two new ones for the tails. 527bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Node newChildren = new Node(); 528bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newOldWord = new CharGroup( 529bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, 530eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard currentGroup.mChars.length), currentGroup.mShortcutTargets, 53172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mBigrams, currentGroup.mFrequency, 53272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry, 53372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mChildren); 534bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(newOldWord); 535bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 536bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup newParent; 537bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 538bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newParent = new CharGroup( 539bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 54072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 54172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry, newChildren); 542bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 543bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newParent = new CharGroup( 544bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 54545239029ceb876462e0d3f654c6b24ac9a9ed8afKen Wakasa null /* shortcutTargets */, null /* bigrams */, -1, 54672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); 54744c64f46a143623dd793facd889c8d6eab5e230cJean Chalard final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word, 54844c64f46a143623dd793facd889c8d6eab5e230cJean Chalard charIndex + differentCharIndex, word.length), 54972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 55072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry); 551bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int addIndex = word[charIndex + differentCharIndex] 552bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard > currentGroup.mChars[differentCharIndex] ? 1 : 0; 553bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(addIndex, newWord); 554bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 555bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentNode.mData.set(nodeIndex, newParent); 556bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 55747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard if (DBG) checkStack(currentNode); 558bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 559bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 560bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 561bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 56293ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka private static int ARRAYS_ARE_EQUAL = 0; 56393ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka 564bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 565bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Custom comparison of two int arrays taken to contain character codes. 566bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 567bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method compares the two arrays passed as an argument in a lexicographic way, 568bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * with an offset in the dst string. 569bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method does NOT test for the first character. It is taken to be equal. 570bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 571bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 572bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * strings are equal. This works BECAUSE we don't look at the first character. 573bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 574bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param src the left-hand side string of the comparison. 575bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dst the right-hand side string of the comparison. 576bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dstOffset the offset in the right-hand side string. 577bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 578bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 579bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private static int compareArrays(final int[] src, final int[] dst, int dstOffset) { 580bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We do NOT test the first char, because we come from a method that already 581bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tested it. 582bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 1; i < src.length; ++i) { 583bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dstOffset + i >= dst.length) return i; 584bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (src[i] != dst[dstOffset + i]) return i; 585bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 586bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dst.length > src.length) return src.length; 587bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return ARRAYS_ARE_EQUAL; 588bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 589bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 590bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 591bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper class that compares and sorts two chargroups according to their 592bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * first element only. I repeat: ONLY the first element is considered, the rest 593bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * is ignored. 594bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This comparator imposes orderings that are inconsistent with equals. 595bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 596a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka static private final class CharGroupComparator implements java.util.Comparator<CharGroup> { 59793ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka @Override 5982aa02b84a4fcfaf5554c278d2b25cf9414eecf8bKen Wakasa public int compare(CharGroup c1, CharGroup c2) { 599bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (c1.mChars[0] == c2.mChars[0]) return 0; 600bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return c1.mChars[0] < c2.mChars[0] ? -1 : 1; 601bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 602bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 603bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator(); 604bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 605bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 606bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Finds the insertion index of a character within a node. 607bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 608bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private static int findInsertionIndex(final Node node, int character) { 6092aa02b84a4fcfaf5554c278d2b25cf9414eecf8bKen Wakasa final ArrayList<CharGroup> data = node.mData; 61044c64f46a143623dd793facd889c8d6eab5e230cJean Chalard final CharGroup reference = new CharGroup(new int[] { character }, 61172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, 61272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 613bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); 614bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return result >= 0 ? result : -result - 1; 615bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 616bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 617bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 618bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Find the index of a char in a node, if it exists. 619bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 620bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param node the node to search in. 621bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param character the character to search for. 62293445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. 623bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 624bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard private static int findIndexOfChar(final Node node, int character) { 625bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int insertionIndex = findInsertionIndex(node, character); 62693445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; 627bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex 62893445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard : CHARACTER_NOT_FOUND_INDEX; 629bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 630bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 631bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 632bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to find a word in a given branch. 633bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 6342da886651874b2588f18f800417ba858ac93d88bJean Chalard @SuppressWarnings("unused") 635a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard public static CharGroup findWordInTree(Node node, final String string) { 636bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int index = 0; 63712efad3d15147f255f6e01600c40e9fdb1224d84Jean Chalard final StringBuilder checker = DBG ? new StringBuilder() : null; 638a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard final int[] codePoints = getCodePoints(string); 639bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 640bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup currentGroup; 641bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 642a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard int indexOfGroup = findIndexOfChar(node, codePoints[index]); 64393445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; 644bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentGroup = node.mData.get(indexOfGroup); 64566f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 646a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (codePoints.length - index < currentGroup.mChars.length) return null; 64766f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada int newIndex = index; 648a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard while (newIndex < codePoints.length && newIndex - index < currentGroup.mChars.length) { 649a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (currentGroup.mChars[newIndex - index] != codePoints[newIndex]) return null; 65066f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada newIndex++; 65166f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada } 65266f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada index = newIndex; 65366f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 65412efad3d15147f255f6e01600c40e9fdb1224d84Jean Chalard if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length)); 655a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) { 656bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard node = currentGroup.mChildren; 657bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 658a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard } while (null != node && index < codePoints.length); 659bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 660a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) return null; 6610d35c159fefd7591c2ab9d5037c32d1804024197Yuichiro Hanada if (!currentGroup.isTerminal()) return null; 662ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard if (DBG && !string.equals(checker.toString())) return null; 663bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return currentGroup; 664bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 665bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 666bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 667eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard * Helper method to find out whether a word is in the dict or not. 668eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard */ 669eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard public boolean hasWord(final String s) { 670eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard if (null == s || "".equals(s)) { 671eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard throw new RuntimeException("Can't search for a null or empty string"); 672eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 673eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard return null != findWordInTree(mRoot, s); 674eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 675eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard 676eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard /** 677bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Recursively count the number of character groups in a given branch of the trie. 678bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 679bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param node the parent node. 680bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the number of char groups in all the branch under this node. 681bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 682bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static int countCharGroups(final Node node) { 683bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int nodeSize = node.mData.size(); 684bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = nodeSize; 685bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = nodeSize - 1; i >= 0; --i) { 686bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup group = node.mData.get(i); 687bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null != group.mChildren) 688bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard size += countCharGroups(group.mChildren); 689bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 690bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 691bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 692bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 693bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 694bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Recursively count the number of nodes in a given branch of the trie. 695bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 696bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param node the node to count. 69720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return the number of nodes in this branch. 698bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 699bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static int countNodes(final Node node) { 700bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = 1; 701bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = node.mData.size() - 1; i >= 0; --i) { 702bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard CharGroup group = node.mData.get(i); 703bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null != group.mChildren) 704bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard size += countNodes(group.mChildren); 705bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 706bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 707bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 708bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 70920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // Recursively find out whether there are any bigrams. 71020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // This can be pretty expensive especially if there aren't any (we return as soon 71120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // as we find one, so it's much cheaper if there are bigrams) 71220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard private static boolean hasBigramsInternal(final Node node) { 71320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard if (null == node) return false; 71420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard for (int i = node.mData.size() - 1; i >= 0; --i) { 71520a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard CharGroup group = node.mData.get(i); 71620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard if (null != group.mBigrams) return true; 71720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard if (hasBigramsInternal(group.mChildren)) return true; 71820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 71920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard return false; 72020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 72120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 72220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard /** 72320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * Finds out whether there are any bigrams in this dictionary. 72420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * 72520a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return true if there is any bigram, false otherwise. 72620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard */ 72720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // TODO: this is expensive especially for large dictionaries without any bigram. 72820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // The up side is, this is always accurate and correct and uses no memory. We should 72920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // find a more efficient way of doing this, without compromising too much on memory 73020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // and ease of use. 73120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard public boolean hasBigrams() { 73220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard return hasBigramsInternal(mRoot); 73320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 73420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 735bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Historically, the tails of the words were going to be merged to save space. 736bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // However, that would prevent the code to search for a specific address in log(n) 737bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // time so this was abandoned. 738bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The code is still of interest as it does add some compression to any dictionary 739bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // that has no need for attributes. Implementations that does not read attributes should be 740bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // able to read a dictionary with merged tails. 741bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Also, the following code does support frequencies, as in, it will only merges 742bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that share the same frequency. Though it would result in the above loss of 743bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // performance while searching by address, it is still technically possible to merge 744bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that contain attributes, but this code does not take that into account - it does 745bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // not compare attributes and will merge terminals with different attributes regardless. 746bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void mergeTails() { 747bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard MakedictLog.i("Do not merge tails"); 748bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return; 749bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 750bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root)); 751bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of groups : " + countCharGroups(root)); 752bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 753bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final HashMap<String, ArrayList<Node>> repository = 754bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// new HashMap<String, ArrayList<Node>>(); 755bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// mergeTailsInner(repository, root); 756bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 757bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of different pseudohashes : " + repository.size()); 758bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// int size = 0; 759bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (ArrayList<Node> a : repository.values()) { 760bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// size += a.size(); 761bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 762bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of nodes after merge : " + (1 + size)); 763bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 764bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 765bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 766bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The following methods are used by the deactivated mergeTails() 767bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// private static boolean isEqual(Node a, Node b) { 768bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a && null == b) return true; 769bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a || null == b) return false; 770bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (a.data.size() != b.data.size()) return false; 771bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int size = a.data.size(); 772bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = size - 1; i >= 0; --i) { 773bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// CharGroup aGroup = a.data.get(i); 774bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// CharGroup bGroup = b.data.get(i); 775bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (aGroup.frequency != bGroup.frequency) return false; 776bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (aGroup.alternates == null && bGroup.alternates != null) return false; 777bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false; 778bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false; 779bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!isEqual(aGroup.children, bGroup.children)) return false; 780bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 781bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return true; 782bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 783bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 784bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// static private HashMap<String, ArrayList<Node>> mergeTailsInner( 785bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final HashMap<String, ArrayList<Node>> map, final Node node) { 786bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final ArrayList<CharGroup> branches = node.data; 787bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int nodeSize = branches.size(); 788bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = 0; i < nodeSize; ++i) { 789bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// CharGroup group = branches.get(i); 790bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null != group.children) { 791bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// String pseudoHash = getPseudoHash(group.children); 792bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// ArrayList<Node> similarList = map.get(pseudoHash); 793bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == similarList) { 794bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// similarList = new ArrayList<Node>(); 795bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// map.put(pseudoHash, similarList); 796bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 797bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// boolean merged = false; 798bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (Node similar : similarList) { 799bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (isEqual(group.children, similar)) { 800bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// group.children = similar; 801bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// merged = true; 802bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// break; 803bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 804bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 805bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!merged) { 806bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// similarList.add(group.children); 807bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 808bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// mergeTailsInner(map, group.children); 809bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 810bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 811bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return map; 812bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 813bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 814bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// private static String getPseudoHash(final Node node) { 815bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// StringBuilder s = new StringBuilder(); 816bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (CharGroup g : node.data) { 817bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// s.append(g.frequency); 818f2789819bd005b5b0581e8439601b5501306327dKen Wakasa// for (int ch : g.chars) { 819bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// s.append(Character.toChars(ch)); 820bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 821bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 822bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return s.toString(); 823bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 824bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 825bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 826bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Iterator to walk through a dictionary. 827bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 828bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This is purely for convenience. 829bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 830a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryIterator implements Iterator<Word> { 831a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka private static final class Position { 832bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Iterator<CharGroup> pos; 833bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public int length; 834bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Position(ArrayList<CharGroup> groups) { 835bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard pos = groups.iterator(); 836bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard length = 0; 837bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 838bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 839bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final StringBuilder mCurrentString; 840bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final LinkedList<Position> mPositions; 841bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 842bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public DictionaryIterator(ArrayList<CharGroup> root) { 843bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString = new StringBuilder(); 844bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions = new LinkedList<Position>(); 845bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final Position rootPos = new Position(root); 846bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.add(rootPos); 847bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 848bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 849bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 850bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasNext() { 851bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (Position p : mPositions) { 852bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (p.pos.hasNext()) { 853bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return true; 854bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 855bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 856bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return false; 857bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 858bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 859bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 860bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Word next() { 861bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Position currentPos = mPositions.getLast(); 862a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(currentPos.length); 863bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 864bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 865bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentPos.pos.hasNext()) { 866bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final CharGroup currentGroup = currentPos.pos.next(); 867a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard currentPos.length = mCurrentString.length(); 868ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard for (int i : currentGroup.mChars) { 869bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString.append(Character.toChars(i)); 870ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 871bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null != currentGroup.mChildren) { 872bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentPos = new Position(currentGroup.mChildren.mData); 873ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard currentPos.length = mCurrentString.length(); 874bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.addLast(currentPos); 875bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 876ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard if (currentGroup.mFrequency >= 0) { 877bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return new Word(mCurrentString.toString(), currentGroup.mFrequency, 87872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mShortcutTargets, currentGroup.mBigrams, 87972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry); 880ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 881bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 882bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.removeLast(); 883bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentPos = mPositions.getLast(); 884a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(mPositions.getLast().length); 885bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 886f2789819bd005b5b0581e8439601b5501306327dKen Wakasa } while (true); 887bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 888bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 889bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 890bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void remove() { 891bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new UnsupportedOperationException("Unsupported yet"); 892bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 893bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 894bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 895bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 896bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 897bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Method to return an iterator. 898bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 899bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 900bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * and say : for (Word w : x) {} 901bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 902bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 903bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Iterator<Word> iterator() { 904bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return new DictionaryIterator(mRoot.mData); 905bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 906bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard} 907