FusionDictionary.java revision a141d8ef7dcf8f942eb7bd4ca006f63da1744319
1bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/* 2bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Copyright (C) 2011 The Android Open Source Project 3bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License"); 58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License. 68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at 7bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * http://www.apache.org/licenses/LICENSE-2.0 9bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 10bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Unless required by applicable law or agreed to in writing, software 118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS, 128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and 148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License. 15bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 16bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 17cdc51fc6afc7fd374c5c9eeb0539cae5cf1de724Tom Ouyangpackage com.android.inputmethod.latin.makedict; 18bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 19b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasaimport com.android.inputmethod.annotations.UsedForTesting; 20eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanadaimport com.android.inputmethod.latin.Constants; 21eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 22bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.ArrayList; 23bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Arrays; 24bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Collections; 2547cac57e4593f47e753410e4199e84e458d6de6fJean Chalardimport java.util.Date; 26c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalardimport java.util.HashMap; 27bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Iterator; 28bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.LinkedList; 29bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 30bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/** 31bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A dictionary that can fusion heads and tails of words for more compression. 32bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 33b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasa@UsedForTesting 34a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaokapublic final class FusionDictionary implements Iterable<Word> { 3547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard private static final boolean DBG = MakedictLog.DBG; 3647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard 3793445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard private static int CHARACTER_NOT_FOUND_INDEX = -1; 3893445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard 39bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 40576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * A node array of the dictionary, containing several PtNodes. 41bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 42576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the 43bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * real information. 44bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This class also contains fields to cache size and address, to help with binary 45bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * generation. 46bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 47af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public static final class PtNodeArray { 48576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ArrayList<PtNode> mData; 49bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // To help with binary generation 50e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedSize = Integer.MIN_VALUE; 5191cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 5291cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // always hold the same value except between dictionary address compression, during which 5391cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the update process needs to know about both values at the same time. Updating will 5491cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // update the AfterUpdate value, and the code will move them to BeforeUpdate before 5591cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the next update pass. 5691cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 5791cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 58e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedParentAddress = 0; 59e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada 60af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public PtNodeArray() { 61576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada mData = new ArrayList<PtNode>(); 62bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 63576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNodeArray(ArrayList<PtNode> data) { 64bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mData = data; 65bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 66bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 67bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 68bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 69bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A string with a frequency. 70bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 71bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This represents an "attribute", that is either a bigram or a shortcut. 72bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 73a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class WeightedString { 7454e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public final String mWord; 7554e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public int mFrequency; 76bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public WeightedString(String word, int frequency) { 77bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mWord = word; 78bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 79bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 809f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 819f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 829f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public int hashCode() { 839f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return Arrays.hashCode(new Object[] { mWord, mFrequency }); 849f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 859f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 869f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 879f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public boolean equals(Object o) { 889f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (o == this) return true; 899f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (!(o instanceof WeightedString)) return false; 909f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa WeightedString w = (WeightedString)o; 919f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return mWord.equals(w.mWord) && mFrequency == w.mFrequency; 929f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 93bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 94bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 95bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 96576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children 97576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * (Pt means Patricia Trie). 98bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 99576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This is the central class of the in-memory representation. A PtNode is what can 100bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * be seen as a traditional "trie node", except it can hold several characters at the 101576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * same time. A PtNode essentially represents one or several characters in the middle 102af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * of the trie tree; as such, it can be a terminal, and it can have children. 103576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * In this in-memory representation, whether the PtNode is a terminal or not is represented 104bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 105bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 106bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 107bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 108576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static final class PtNode { 109bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static final int NOT_A_TERMINAL = -1; 110bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int mChars[]; 1117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mShortcutTargets; 1127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mBigrams; 1137cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 114a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. 115af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray mChildren; 11672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsNotAWord; // Only a shortcut 11772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsBlacklistEntry; 11825de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary 11925de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // generation. Before and After always hold the same value except during dictionary 12025de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // address compression, where the update process needs to know about both values at the 12125de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // same time. Updating will update the AfterUpdate value, and the code will move them 12225de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // to BeforeUpdate before the next update pass. 12325de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // The update process does not need two versions of mCachedSize. 124576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedSize; // The size, in bytes, of this PtNode. 125576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) 126576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedAddressAfterUpdate; // The address of this PtNode (after update) 127bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 128576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 12972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 13072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 131bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 132bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 133a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada mTerminalId = frequency; 134eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 135bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 136bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = null; 13772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 13872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 139bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 140bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 141576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 14272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 143af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final boolean isNotAWord, final boolean isBlacklistEntry, 144af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final PtNodeArray children) { 145bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 146bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 147eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 148bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 149bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = children; 15072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 15172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 152bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 153bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 154576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public void addChild(PtNode n) { 155bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null == mChildren) { 156af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard mChildren = new PtNodeArray(); 157bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 158bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren.mData.add(n); 159bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 160bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 161a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada public int getTerminalId() { 162a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada return mTerminalId; 163a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada } 164a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada 165bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean isTerminal() { 166bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return NOT_A_TERMINAL != mFrequency; 167bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 168bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 169b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard public int getFrequency() { 170b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard return mFrequency; 171b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard } 172b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard 173a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsNotAWord() { 174a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsNotAWord; 175a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 176a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 177a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsBlacklistEntry() { 178a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsBlacklistEntry; 179a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 180a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 181a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getShortcutTargets() { 182a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 183a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mShortcutTargets) return null; 184f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfShortcutTargets = 185f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard new ArrayList<WeightedString>(mShortcutTargets); 186a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfShortcutTargets; 187a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 188a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 189a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getBigrams() { 190a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 191a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mBigrams) return null; 192f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); 193a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfBigrams; 194a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 195a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 196bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasSeveralChars() { 197bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(mChars.length > 0); 198bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return 1 < mChars.length; 199bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 2007cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2017cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2027cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Adds a word to the bigram list. Updates the frequency if the word already 2037cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * exists. 2047cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2057cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void addBigram(final String word, final int frequency) { 2067cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 2077cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = new ArrayList<WeightedString>(); 2087cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2097cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = getBigram(word); 2107cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram != null) { 2117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram.mFrequency = frequency; 2127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2137cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram = new WeightedString(word, frequency); 2147cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2157cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2167cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2177cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2187cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2197cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the shortcut target for the given word. Returns null if the word is not in the 2207cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * shortcut list. 2217cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2227cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getShortcut(final String word) { 22347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2247cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets != null) { 2257cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mShortcutTargets.size(); 2267cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString shortcut = mShortcutTargets.get(i); 2287cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcut.mWord.equals(word)) { 2297cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return shortcut; 2307cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2317cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2327cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2337cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2347cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2357cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the bigram for the given word. 2387cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Returns null if the word is not in the bigrams list. 2397cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2407cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getBigram(final String word) { 24147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2427cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams != null) { 2437cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mBigrams.size(); 2447cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2457cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = mBigrams.get(i); 2467cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram.mWord.equals(word)) { 2477cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return bigram; 2487cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2497cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2507cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2517cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2527cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2537cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2547cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 255576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to 2567cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 2577cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * updated if they are higher than the existing ones. 2587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 25972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets, 26072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, 26172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 2627cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (frequency > mFrequency) { 2637cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mFrequency = frequency; 2647cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2657cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcutTargets != null) { 2667cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets == null) { 2677cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets = shortcutTargets; 2687cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2697cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = shortcutTargets.size(); 2707cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2717cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString shortcut = shortcutTargets.get(i); 2727cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingShortcut = getShortcut(shortcut.mWord); 2737cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingShortcut == null) { 2747cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets.add(shortcut); 2757cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingShortcut.mFrequency < shortcut.mFrequency) { 2767cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingShortcut.mFrequency = shortcut.mFrequency; 2777cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2787cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2797cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2807cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2817cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigrams != null) { 2827cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 2837cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = bigrams; 2847cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2857cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = bigrams.size(); 2867cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2877cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString bigram = bigrams.get(i); 2887cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingBigram = getBigram(bigram.mWord); 2897cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingBigram == null) { 2907cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2917cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingBigram.mFrequency < bigram.mFrequency) { 2927cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingBigram.mFrequency = bigram.mFrequency; 2937cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2947cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2957cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2967cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 29772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 29872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 2997cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 300bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 301bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 302bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 303bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Options global to the dictionary. 304bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 305a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryOptions { 306f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mGermanUmlautProcessing; 307f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mFrenchLigatureProcessing; 308f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final HashMap<String, String> mAttributes; 309f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public DictionaryOptions(final HashMap<String, String> attributes, 310f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { 311c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mAttributes = attributes; 312f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mGermanUmlautProcessing = germanUmlautProcessing; 313f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mFrenchLigatureProcessing = frenchLigatureProcessing; 314c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 31547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard @Override 31647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard public String toString() { // Convenience method 31751a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard return toString(0, false); 31847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 31951a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard public String toString(final int indentCount, final boolean plumbing) { 32047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder indent = new StringBuilder(); 32151a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if (plumbing) { 32251a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append("H:"); 32351a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } else { 32451a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard for (int i = 0; i < indentCount; ++i) { 32551a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append(" "); 32651a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } 32747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 32847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder s = new StringBuilder(); 32947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard for (final String optionKey : mAttributes.keySet()) { 33047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 33147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(optionKey); 33247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(" = "); 33351a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if ("date".equals(optionKey) && !plumbing) { 33447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard // Date needs a number of milliseconds, but the dictionary contains seconds 33547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(new Date( 33647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); 33747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } else { 33847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(mAttributes.get(optionKey)); 33947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 34047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("\n"); 34147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 34247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mGermanUmlautProcessing) { 34347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 34447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs German umlaut processing\n"); 34547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 34647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mFrenchLigatureProcessing) { 34747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 34847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs French ligature processing\n"); 34947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 35047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard return s.toString(); 35147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 352bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 353bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 354bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public final DictionaryOptions mOptions; 355af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public final PtNodeArray mRootNodeArray; 356bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 357af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { 358af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard mRootNodeArray = rootNodeArray; 359bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mOptions = options; 360bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 361bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 362c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard public void addOptionAttribute(final String key, final String value) { 363c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mOptions.mAttributes.put(key, value); 364c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 365c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard 366bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 367bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to convert a String to an int array. 368bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 3693c6d9fe14840fd2c455ec65b6481ed78f99a5460Yuichiro Hanada static int[] getCodePoints(final String word) { 37047db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray, 37147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // which is not visible from the makedict package. Factor this code. 37247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final char[] characters = word.toCharArray(); 37347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int length = characters.length; 37447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 37547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int codePoint = Character.codePointAt(characters, 0); 37647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int dsti = 0; 37747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard for (int srci = Character.charCount(codePoint); 37847db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard srci < length; srci += Character.charCount(codePoint), ++dsti) { 37947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 38047db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoint = Character.codePointAt(characters, srci); 381c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 38247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 38347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard return codePoints; 384c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 385c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard 386c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard /** 387bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to add a word as a string. 388bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 389bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method adds a word to the dictionary with the given frequency. Optional 390bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * lists of bigrams and shortcuts can be passed here. For each word inside, 391bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * they will be added to the dictionary as necessary. 392bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 393bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word to add. 394bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 395eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 39672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 397bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 398eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public void add(final String word, final int frequency, 39972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 40072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, 40172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 40272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard } 40372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard 40472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard /** 40572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * Helper method to add a blacklist entry as a string. 40672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * 40772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param word the word to add as a blacklist entry. 40872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 40972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 41072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard */ 41172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void addBlacklistEntry(final String word, 41272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 41372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); 414bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 415bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 416bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 417576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Sanity check for a PtNode array. 418bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 419576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This method checks that all PtNodes in a node array are ordered as expected. 420bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * If they are, nothing happens. If they aren't, an exception is thrown. 421bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 422576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada private void checkStack(PtNodeArray ptNodeArray) { 423576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ArrayList<PtNode> stack = ptNodeArray.mData; 424bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int lastValue = -1; 425bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 0; i < stack.size(); ++i) { 426bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int currentValue = stack.get(i).mChars[0]; 427bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentValue <= lastValue) 428bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new RuntimeException("Invalid stack"); 429bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard else 430bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard lastValue = currentValue; 431bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 432bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 433bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 434bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 4357cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Helper method to add a new bigram to the dictionary. 4367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * 4377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word1 the previous word of the context 4387cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word2 the next word of the context 4397cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param frequency the bigram frequency 4407cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 4417cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void setBigram(final String word1, final String word2, final int frequency) { 442576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = findWordInTree(mRootNodeArray, word1); 443576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNode != null) { 444576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2); 445576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNode2 == null) { 44672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word2), 0, null, false /* isNotAWord */, 44772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 448576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // The PtNode for the first word may have moved by the above insertion, 449ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // if word1 and word2 share a common stem that happens not to have been 450576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // a cutting point until now. In this case, we need to refresh ptNode. 451576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode = findWordInTree(mRootNodeArray, word1); 4527cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 453576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode.addBigram(word2, frequency); 4547cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 4557cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang throw new RuntimeException("First word of bigram not found"); 4567cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4577cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 4597cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 460bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Add a word to this dictionary. 461bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 46244c64f46a143623dd793facd889c8d6eab5e230cJean Chalard * The shortcuts, if any, have to be in the dictionary already. If they aren't, 463bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * an exception is thrown. 464bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 465bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word, as an int array. 466bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 467eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 46872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 46972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 470bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 471eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard private void add(final int[] word, final int frequency, 47272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, 47372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 474bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(frequency >= 0 && frequency <= 255); 475ffcbbaf12788a9fc9398607a548e552d7d2bf05eSatoshi Kataoka if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { 476eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 477eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada return; 478eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada } 479eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 480af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray currentNodeArray = mRootNodeArray; 481bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int charIndex = 0; 482bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 483576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode currentPtNode = null; 484bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int differentCharIndex = 0; // Set by the loop to the index of the char that differs 485af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); 48693445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { 487576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode = currentNodeArray.mData.get(nodeIndex); 488576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); 489bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (ARRAYS_ARE_EQUAL != differentCharIndex 490576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada && differentCharIndex < currentPtNode.mChars.length) break; 491576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null == currentPtNode.mChildren) break; 492576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada charIndex += currentPtNode.mChars.length; 493bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex >= word.length) break; 494576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentNodeArray = currentPtNode.mChildren; 495af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); 496bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 497bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 49893445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { 499bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // No node at this point to accept the word. Create one. 500af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); 501576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), 50272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); 503576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentNodeArray.mData.add(insertionIndex, newPtNode); 504af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (DBG) checkStack(currentNodeArray); 505bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 506bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // There is a word with a common prefix. 507576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (differentCharIndex == currentPtNode.mChars.length) { 508bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 509bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word is a prefix of an existing word, but the node on which it 510576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // should end already exists as is. Since the old PtNode was not a terminal, 5117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // make it one by filling in its frequency and other attributes 512576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.update(frequency, shortcutTargets, null, isNotAWord, 51372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 514bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 515bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word matches the full old word and extends past it. 516bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We only have to create a new node and add it to the end of this. 517576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newNode = new PtNode( 518bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 51972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, 52072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 521576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren = new PtNodeArray(); 522576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren.mData.add(newNode); 523bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 524bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 525bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (0 == differentCharIndex) { 5267cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // Exact same word. Update the frequency if higher. This will also add the 52744c64f46a143623dd793facd889c8d6eab5e230cJean Chalard // new shortcuts to the existing shortcut list if it already exists. 528576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.update(frequency, shortcutTargets, null, 529576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord && isNotAWord, 530576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsBlacklistEntry || isBlacklistEntry); 531bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 532bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Partial prefix match only. We have to replace the current node with a node 533bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // containing the current prefix and create two new ones for the tails. 534af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray newChildren = new PtNodeArray(); 535576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newOldWord = new PtNode( 536576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, 537576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChars.length), currentPtNode.mShortcutTargets, 538576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mBigrams, currentPtNode.mFrequency, 539576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry, 540576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren); 541bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(newOldWord); 542bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 543576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newParent; 544bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 545576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada newParent = new PtNode( 546576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 54772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 54872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry, newChildren); 549bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 550576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada newParent = new PtNode( 551576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 55245239029ceb876462e0d3f654c6b24ac9a9ed8afKen Wakasa null /* shortcutTargets */, null /* bigrams */, -1, 55372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); 554576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newWord = new PtNode(Arrays.copyOfRange(word, 55544c64f46a143623dd793facd889c8d6eab5e230cJean Chalard charIndex + differentCharIndex, word.length), 55672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 55772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry); 558bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int addIndex = word[charIndex + differentCharIndex] 559576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada > currentPtNode.mChars[differentCharIndex] ? 1 : 0; 560bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(addIndex, newWord); 561bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 562af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard currentNodeArray.mData.set(nodeIndex, newParent); 563bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 564af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (DBG) checkStack(currentNodeArray); 565bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 566bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 567bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 568bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 56993ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka private static int ARRAYS_ARE_EQUAL = 0; 57093ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka 571bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 572bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Custom comparison of two int arrays taken to contain character codes. 573bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 574bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method compares the two arrays passed as an argument in a lexicographic way, 575bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * with an offset in the dst string. 576bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method does NOT test for the first character. It is taken to be equal. 577bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 578bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 579bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * strings are equal. This works BECAUSE we don't look at the first character. 580bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 581bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param src the left-hand side string of the comparison. 582bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dst the right-hand side string of the comparison. 583bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dstOffset the offset in the right-hand side string. 584bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 585bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 586af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { 587bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We do NOT test the first char, because we come from a method that already 588bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tested it. 589bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 1; i < src.length; ++i) { 590bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dstOffset + i >= dst.length) return i; 591bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (src[i] != dst[dstOffset + i]) return i; 592bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 593bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dst.length > src.length) return src.length; 594bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return ARRAYS_ARE_EQUAL; 595bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 596bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 597bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 598576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Helper class that compares and sorts two PtNodes according to their 599bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * first element only. I repeat: ONLY the first element is considered, the rest 600bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * is ignored. 601bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This comparator imposes orderings that are inconsistent with equals. 602bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 603576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada static private final class PtNodeComparator implements java.util.Comparator<PtNode> { 60493ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka @Override 605576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public int compare(PtNode p1, PtNode p2) { 606576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (p1.mChars[0] == p2.mChars[0]) return 0; 607576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return p1.mChars[0] < p2.mChars[0] ? -1 : 1; 608bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 609bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 610576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); 611bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 612bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 613af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Finds the insertion index of a character within a node array. 614bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 615af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { 616576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final ArrayList<PtNode> data = nodeArray.mData; 617576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode reference = new PtNode(new int[] { character }, 61872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, 61972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 620576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); 621bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return result >= 0 ? result : -result - 1; 622bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 623bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 624bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 625af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Find the index of a char in a node array, if it exists. 626bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 627af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the node array to search in. 628bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param character the character to search for. 62993445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. 630bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 631af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { 632af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int insertionIndex = findInsertionIndex(nodeArray, character); 633af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; 634af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex 63593445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard : CHARACTER_NOT_FOUND_INDEX; 636bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 637bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 638bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 639bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to find a word in a given branch. 640bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 6412da886651874b2588f18f800417ba858ac93d88bJean Chalard @SuppressWarnings("unused") 642576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) { 643bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int index = 0; 64412efad3d15147f255f6e01600c40e9fdb1224d84Jean Chalard final StringBuilder checker = DBG ? new StringBuilder() : null; 645a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard final int[] codePoints = getCodePoints(string); 646bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 647576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode currentPtNode; 648bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 649af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); 65093445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; 651576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode = nodeArray.mData.get(indexOfGroup); 65266f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 653576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (codePoints.length - index < currentPtNode.mChars.length) return null; 65466f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada int newIndex = index; 655576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { 656576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; 65766f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada newIndex++; 65866f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada } 65966f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada index = newIndex; 66066f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 661576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (DBG) { 662576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); 663576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada } 664a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) { 665576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada nodeArray = currentPtNode.mChildren; 666bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 667af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard } while (null != nodeArray && index < codePoints.length); 668bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 669a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) return null; 670576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (!currentPtNode.isTerminal()) return null; 671ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard if (DBG && !string.equals(checker.toString())) return null; 672576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return currentPtNode; 673bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 674bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 675bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 676eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard * Helper method to find out whether a word is in the dict or not. 677eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard */ 678eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard public boolean hasWord(final String s) { 679eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard if (null == s || "".equals(s)) { 680eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard throw new RuntimeException("Can't search for a null or empty string"); 681eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 682af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return null != findWordInTree(mRootNodeArray, s); 683eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 684eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard 685eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard /** 686576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Recursively count the number of PtNodes in a given branch of the trie. 687bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 688af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the parent node. 689576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @return the number of PtNodes in all the branch under this node. 690bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 691576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static int countPtNodes(final PtNodeArray nodeArray) { 692af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int nodeSize = nodeArray.mData.size(); 693bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = nodeSize; 694bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = nodeSize - 1; i >= 0; --i) { 695576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = nodeArray.mData.get(i); 696576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mChildren) 697576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += countPtNodes(ptNode.mChildren); 698bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 699bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 700bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 701bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 702bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 703bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Recursively count the number of nodes in a given branch of the trie. 704bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 705af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the node array to count. 70620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return the number of nodes in this branch. 707bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 708af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public static int countNodeArrays(final PtNodeArray nodeArray) { 709bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = 1; 710af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { 711576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = nodeArray.mData.get(i); 712576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mChildren) 713576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += countNodeArrays(ptNode.mChildren); 714bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 715bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 716bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 717bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 71820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // Recursively find out whether there are any bigrams. 71920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // This can be pretty expensive especially if there aren't any (we return as soon 72020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // as we find one, so it's much cheaper if there are bigrams) 721af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static boolean hasBigramsInternal(final PtNodeArray nodeArray) { 722af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (null == nodeArray) return false; 723af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { 724576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = nodeArray.mData.get(i); 725576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mBigrams) return true; 726576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (hasBigramsInternal(ptNode.mChildren)) return true; 72720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 72820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard return false; 72920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 73020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 73120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard /** 73220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * Finds out whether there are any bigrams in this dictionary. 73320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * 73420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return true if there is any bigram, false otherwise. 73520a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard */ 73620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // TODO: this is expensive especially for large dictionaries without any bigram. 73720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // The up side is, this is always accurate and correct and uses no memory. We should 73820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // find a more efficient way of doing this, without compromising too much on memory 73920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // and ease of use. 74020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard public boolean hasBigrams() { 741af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return hasBigramsInternal(mRootNodeArray); 74220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 74320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 744bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Historically, the tails of the words were going to be merged to save space. 745bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // However, that would prevent the code to search for a specific address in log(n) 746bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // time so this was abandoned. 747bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The code is still of interest as it does add some compression to any dictionary 748bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // that has no need for attributes. Implementations that does not read attributes should be 749bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // able to read a dictionary with merged tails. 750bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Also, the following code does support frequencies, as in, it will only merges 751bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that share the same frequency. Though it would result in the above loss of 752bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // performance while searching by address, it is still technically possible to merge 753bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that contain attributes, but this code does not take that into account - it does 754bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // not compare attributes and will merge terminals with different attributes regardless. 755bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void mergeTails() { 756bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard MakedictLog.i("Do not merge tails"); 757bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return; 758bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 759576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root)); 760576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// MakedictLog.i("Number of PtNodes : " + countPtNodes(root)); 761bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 762af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// final HashMap<String, ArrayList<PtNodeArray>> repository = 763af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// new HashMap<String, ArrayList<PtNodeArray>>(); 764bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// mergeTailsInner(repository, root); 765bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 766bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of different pseudohashes : " + repository.size()); 767bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// int size = 0; 768af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// for (ArrayList<PtNodeArray> a : repository.values()) { 769bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// size += a.size(); 770bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 771bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of nodes after merge : " + (1 + size)); 772bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 773bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 774bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 775bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The following methods are used by the deactivated mergeTails() 776af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// private static boolean isEqual(PtNodeArray a, PtNodeArray b) { 777bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a && null == b) return true; 778bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a || null == b) return false; 779bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (a.data.size() != b.data.size()) return false; 780bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int size = a.data.size(); 781bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = size - 1; i >= 0; --i) { 782576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// PtNode aPtNode = a.data.get(i); 783576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// PtNode bPtNode = b.data.get(i); 784576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (aPtNode.frequency != bPtNode.frequency) return false; 785576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (aPtNode.alternates == null && bPtNode.alternates != null) return false; 786576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false; 787576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false; 788576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (!isEqual(aPtNode.children, bPtNode.children)) return false; 789bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 790bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return true; 791bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 792bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 793af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner( 794af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) { 795576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// final ArrayList<PtNode> branches = nodeArray.data; 796bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int nodeSize = branches.size(); 797bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = 0; i < nodeSize; ++i) { 798576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// PtNode ptNode = branches.get(i); 799576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (null != ptNode.children) { 800576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// String pseudoHash = getPseudoHash(ptNode.children); 801af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// ArrayList<PtNodeArray> similarList = map.get(pseudoHash); 802bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == similarList) { 803af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// similarList = new ArrayList<PtNodeArray>(); 804bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// map.put(pseudoHash, similarList); 805bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 806bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// boolean merged = false; 807af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// for (PtNodeArray similar : similarList) { 808576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (isEqual(ptNode.children, similar)) { 809576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// ptNode.children = similar; 810bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// merged = true; 811bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// break; 812bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 813bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 814bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!merged) { 815576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// similarList.add(ptNode.children); 816bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 817576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// mergeTailsInner(map, ptNode.children); 818bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 819bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 820bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return map; 821bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 822bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 823af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// private static String getPseudoHash(final PtNodeArray nodeArray) { 824bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// StringBuilder s = new StringBuilder(); 825576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// for (PtNode ptNode : nodeArray.data) { 826576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// s.append(ptNode.frequency); 827576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// for (int ch : ptNode.chars) { 828bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// s.append(Character.toChars(ch)); 829bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 830bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 831bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return s.toString(); 832bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 833bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 834bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 835bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Iterator to walk through a dictionary. 836bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 837bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This is purely for convenience. 838bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 839a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryIterator implements Iterator<Word> { 840a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka private static final class Position { 841576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public Iterator<PtNode> pos; 842bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public int length; 843576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public Position(ArrayList<PtNode> ptNodes) { 844576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada pos = ptNodes.iterator(); 845bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard length = 0; 846bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 847bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 848bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final StringBuilder mCurrentString; 849bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final LinkedList<Position> mPositions; 850bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 851576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public DictionaryIterator(ArrayList<PtNode> ptRoot) { 852bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString = new StringBuilder(); 853bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions = new LinkedList<Position>(); 854576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final Position rootPos = new Position(ptRoot); 855bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.add(rootPos); 856bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 857bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 858bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 859bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasNext() { 860bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (Position p : mPositions) { 861bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (p.pos.hasNext()) { 862bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return true; 863bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 864bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 865bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return false; 866bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 867bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 868bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 869bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Word next() { 870bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Position currentPos = mPositions.getLast(); 871a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(currentPos.length); 872bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 873bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 874bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentPos.pos.hasNext()) { 875576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode currentPtNode = currentPos.pos.next(); 876a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard currentPos.length = mCurrentString.length(); 877576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (int i : currentPtNode.mChars) { 878bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString.append(Character.toChars(i)); 879ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 880576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != currentPtNode.mChildren) { 881576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPos = new Position(currentPtNode.mChildren.mData); 882ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard currentPos.length = mCurrentString.length(); 883bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.addLast(currentPos); 884bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 885576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (currentPtNode.mFrequency >= 0) { 886576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return new Word(mCurrentString.toString(), currentPtNode.mFrequency, 887576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mShortcutTargets, currentPtNode.mBigrams, 888576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry); 889ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 890bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 891bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.removeLast(); 892bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentPos = mPositions.getLast(); 893a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(mPositions.getLast().length); 894bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 895f2789819bd005b5b0581e8439601b5501306327dKen Wakasa } while (true); 896bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 897bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 898bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 899bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void remove() { 900bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new UnsupportedOperationException("Unsupported yet"); 901bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 902bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 903bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 904bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 905bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 906bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Method to return an iterator. 907bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 908bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 909bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * and say : for (Word w : x) {} 910bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 911bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 912bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Iterator<Word> iterator() { 913af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return new DictionaryIterator(mRootNodeArray.mData); 914bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 915bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard} 916