FusionDictionary.java revision ca9c3c06137af878607f16573585c72041a4b7bf
1bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/* 2bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Copyright (C) 2011 The Android Open Source Project 3bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License"); 58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License. 68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at 7bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * http://www.apache.org/licenses/LICENSE-2.0 9bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 10bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Unless required by applicable law or agreed to in writing, software 118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS, 128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and 148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License. 15bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 16bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 17cdc51fc6afc7fd374c5c9eeb0539cae5cf1de724Tom Ouyangpackage com.android.inputmethod.latin.makedict; 18bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 19b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasaimport com.android.inputmethod.annotations.UsedForTesting; 20eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanadaimport com.android.inputmethod.latin.Constants; 21eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 22bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.ArrayList; 23bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Arrays; 24bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Collections; 2547cac57e4593f47e753410e4199e84e458d6de6fJean Chalardimport java.util.Date; 26c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalardimport java.util.HashMap; 27bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Iterator; 28bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.LinkedList; 29bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 30bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/** 31bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A dictionary that can fusion heads and tails of words for more compression. 32bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 33b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasa@UsedForTesting 34a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaokapublic final class FusionDictionary implements Iterable<Word> { 3547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard private static final boolean DBG = MakedictLog.DBG; 3647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard 3793445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard private static int CHARACTER_NOT_FOUND_INDEX = -1; 3893445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard 39bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 40576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * A node array of the dictionary, containing several PtNodes. 41bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 42576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the 43bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * real information. 44bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This class also contains fields to cache size and address, to help with binary 45bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * generation. 46bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 47af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public static final class PtNodeArray { 48576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ArrayList<PtNode> mData; 49bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // To help with binary generation 50e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedSize = Integer.MIN_VALUE; 5191cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 5291cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // always hold the same value except between dictionary address compression, during which 5391cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the update process needs to know about both values at the same time. Updating will 5491cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // update the AfterUpdate value, and the code will move them to BeforeUpdate before 5591cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the next update pass. 5691cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 5791cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 58e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedParentAddress = 0; 59e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada 60af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public PtNodeArray() { 61576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada mData = new ArrayList<PtNode>(); 62bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 63576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNodeArray(ArrayList<PtNode> data) { 64bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mData = data; 65bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 66bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 67bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 68bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 69bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A string with a frequency. 70bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 71bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This represents an "attribute", that is either a bigram or a shortcut. 72bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 73a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class WeightedString { 7454e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public final String mWord; 7554e84a00fc032ba566cbda41feafa71de77e1c43Jean Chalard public int mFrequency; 76bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public WeightedString(String word, int frequency) { 77bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mWord = word; 78bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 79bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 809f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 819f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 829f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public int hashCode() { 839f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return Arrays.hashCode(new Object[] { mWord, mFrequency }); 849f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 859f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa 869f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa @Override 879f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa public boolean equals(Object o) { 889f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (o == this) return true; 899f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa if (!(o instanceof WeightedString)) return false; 909f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa WeightedString w = (WeightedString)o; 919f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa return mWord.equals(w.mWord) && mFrequency == w.mFrequency; 929f0ea52a5db9710df6bef4672d8e193c48451df0Ken Wakasa } 93bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 94bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 95bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 96576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children 97576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * (Pt means Patricia Trie). 98bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 99576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This is the central class of the in-memory representation. A PtNode is what can 100bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * be seen as a traditional "trie node", except it can hold several characters at the 101576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * same time. A PtNode essentially represents one or several characters in the middle 102af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * of the trie tree; as such, it can be a terminal, and it can have children. 103576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * In this in-memory representation, whether the PtNode is a terminal or not is represented 104bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 105bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 106bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 107bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 108576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static final class PtNode { 109bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public static final int NOT_A_TERMINAL = -1; 110bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int mChars[]; 1117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mShortcutTargets; 1127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mBigrams; 1137cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 114a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. 115af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray mChildren; 11672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsNotAWord; // Only a shortcut 11772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsBlacklistEntry; 11825de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary 11925de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // generation. Before and After always hold the same value except during dictionary 12025de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // address compression, where the update process needs to know about both values at the 12125de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // same time. Updating will update the AfterUpdate value, and the code will move them 12225de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // to BeforeUpdate before the next update pass. 12325de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // The update process does not need two versions of mCachedSize. 124576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedSize; // The size, in bytes, of this PtNode. 125576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) 126576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedAddressAfterUpdate; // The address of this PtNode (after update) 127bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 128576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 12972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 13072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 131bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 132bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 133a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada mTerminalId = frequency; 134eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 135bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 136bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = null; 13772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 13872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 139bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 140bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 141576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 14272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, final int frequency, 143af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final boolean isNotAWord, final boolean isBlacklistEntry, 144af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final PtNodeArray children) { 145bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 146bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mFrequency = frequency; 147eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 148bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 149bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = children; 15072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 15172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 152bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 153bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 154576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public void addChild(PtNode n) { 155bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null == mChildren) { 156af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard mChildren = new PtNodeArray(); 157bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 158bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren.mData.add(n); 159bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 160bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 161a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada public int getTerminalId() { 162a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada return mTerminalId; 163a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada } 164a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada 165bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean isTerminal() { 166bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return NOT_A_TERMINAL != mFrequency; 167bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 168bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 169b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard public int getFrequency() { 170b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard return mFrequency; 171b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard } 172b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard 173a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsNotAWord() { 174a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsNotAWord; 175a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 176a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 177a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsBlacklistEntry() { 178a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsBlacklistEntry; 179a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 180a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 181a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getShortcutTargets() { 182a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 183a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mShortcutTargets) return null; 184f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfShortcutTargets = 185f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard new ArrayList<WeightedString>(mShortcutTargets); 186a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfShortcutTargets; 187a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 188a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 189a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getBigrams() { 190a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 191a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mBigrams) return null; 192f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); 193a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfBigrams; 194a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 195a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 196bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasSeveralChars() { 197bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(mChars.length > 0); 198bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return 1 < mChars.length; 199bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 2007cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2017cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2027cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Adds a word to the bigram list. Updates the frequency if the word already 2037cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * exists. 2047cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2057cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void addBigram(final String word, final int frequency) { 2067cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 2077cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = new ArrayList<WeightedString>(); 2087cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2097cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = getBigram(word); 2107cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram != null) { 2117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram.mFrequency = frequency; 2127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2137cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang bigram = new WeightedString(word, frequency); 2147cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2157cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2167cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2177cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2187cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2197cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the shortcut target for the given word. Returns null if the word is not in the 2207cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * shortcut list. 2217cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2227cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getShortcut(final String word) { 22347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2247cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets != null) { 2257cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mShortcutTargets.size(); 2267cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString shortcut = mShortcutTargets.get(i); 2287cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcut.mWord.equals(word)) { 2297cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return shortcut; 2307cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2317cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2327cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2337cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2347cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2357cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the bigram for the given word. 2387cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Returns null if the word is not in the bigrams list. 2397cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2407cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getBigram(final String word) { 24147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2427cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams != null) { 2437cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mBigrams.size(); 2447cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2457cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = mBigrams.get(i); 2467cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram.mWord.equals(word)) { 2477cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return bigram; 2487cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2497cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2507cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2517cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2527cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2537cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2547cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 255576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to 2567cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 2577cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * updated if they are higher than the existing ones. 2587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 25972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets, 26072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, 26172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 2627cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (frequency > mFrequency) { 2637cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mFrequency = frequency; 2647cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2657cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcutTargets != null) { 2667cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets == null) { 2677cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets = shortcutTargets; 2687cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2697cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = shortcutTargets.size(); 2707cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2717cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString shortcut = shortcutTargets.get(i); 2727cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingShortcut = getShortcut(shortcut.mWord); 2737cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingShortcut == null) { 2747cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets.add(shortcut); 2757cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingShortcut.mFrequency < shortcut.mFrequency) { 2767cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingShortcut.mFrequency = shortcut.mFrequency; 2777cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2787cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2797cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2807cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2817cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigrams != null) { 2827cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 2837cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = bigrams; 2847cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2857cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = bigrams.size(); 2867cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2877cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString bigram = bigrams.get(i); 2887cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingBigram = getBigram(bigram.mWord); 2897cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingBigram == null) { 2907cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2917cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else if (existingBigram.mFrequency < bigram.mFrequency) { 2927cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang existingBigram.mFrequency = bigram.mFrequency; 2937cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2947cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2957cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2967cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 29772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 29872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 2997cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 300bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 301bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 302bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 303bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Options global to the dictionary. 304bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 305a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryOptions { 306f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mGermanUmlautProcessing; 307f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final boolean mFrenchLigatureProcessing; 308f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public final HashMap<String, String> mAttributes; 309f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard public DictionaryOptions(final HashMap<String, String> attributes, 310f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { 311c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mAttributes = attributes; 312f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mGermanUmlautProcessing = germanUmlautProcessing; 313f420df28233c26e555d203185fb292e83b94b8c3Jean Chalard mFrenchLigatureProcessing = frenchLigatureProcessing; 314c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 31547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard @Override 31647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard public String toString() { // Convenience method 31751a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard return toString(0, false); 31847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 31951a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard public String toString(final int indentCount, final boolean plumbing) { 32047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder indent = new StringBuilder(); 32151a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if (plumbing) { 32251a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append("H:"); 32351a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } else { 32451a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard for (int i = 0; i < indentCount; ++i) { 32551a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard indent.append(" "); 32651a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard } 32747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 32847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard final StringBuilder s = new StringBuilder(); 32947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard for (final String optionKey : mAttributes.keySet()) { 33047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 33147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(optionKey); 33247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(" = "); 33351a0ef8c59ea590b6e5e80a82fc75bf244084270Jean Chalard if ("date".equals(optionKey) && !plumbing) { 33447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard // Date needs a number of milliseconds, but the dictionary contains seconds 33547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(new Date( 33647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); 33747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } else { 33847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(mAttributes.get(optionKey)); 33947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 34047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("\n"); 34147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 34247cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mGermanUmlautProcessing) { 34347cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 34447cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs German umlaut processing\n"); 34547cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 34647cac57e4593f47e753410e4199e84e458d6de6fJean Chalard if (mFrenchLigatureProcessing) { 34747cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append(indent); 34847cac57e4593f47e753410e4199e84e458d6de6fJean Chalard s.append("Needs French ligature processing\n"); 34947cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 35047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard return s.toString(); 35147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard } 352bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 353bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 354bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public final DictionaryOptions mOptions; 355af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public final PtNodeArray mRootNodeArray; 356bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 357af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { 358af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard mRootNodeArray = rootNodeArray; 359bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mOptions = options; 360bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 361bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 362c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard public void addOptionAttribute(final String key, final String value) { 363c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mOptions.mAttributes.put(key, value); 364c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 365c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard 366bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 367bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to convert a String to an int array. 368bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 3693c6d9fe14840fd2c455ec65b6481ed78f99a5460Yuichiro Hanada static int[] getCodePoints(final String word) { 370ca9c3c06137af878607f16573585c72041a4b7bfJean Chalard // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, 37147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // which is not visible from the makedict package. Factor this code. 372ca9c3c06137af878607f16573585c72041a4b7bfJean Chalard final int length = word.length(); 373ca9c3c06137af878607f16573585c72041a4b7bfJean Chalard if (length <= 0) return new int[] {}; 37447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final char[] characters = word.toCharArray(); 37547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 37647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int codePoint = Character.codePointAt(characters, 0); 37747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int dsti = 0; 37847db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard for (int srci = Character.charCount(codePoint); 37947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard srci < length; srci += Character.charCount(codePoint), ++dsti) { 38047db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 38147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoint = Character.codePointAt(characters, srci); 382c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 38347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 38447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard return codePoints; 385c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 386c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard 387c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard /** 388bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to add a word as a string. 389bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 390bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method adds a word to the dictionary with the given frequency. Optional 391bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * lists of bigrams and shortcuts can be passed here. For each word inside, 392bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * they will be added to the dictionary as necessary. 393bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 394bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word to add. 395bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 396eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 39772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 398bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 399eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard public void add(final String word, final int frequency, 40072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 40172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, 40272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 40372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard } 40472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard 40572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard /** 40672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * Helper method to add a blacklist entry as a string. 40772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * 40872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param word the word to add as a blacklist entry. 40972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 41072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 41172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard */ 41272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void addBlacklistEntry(final String word, 41372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 41472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); 415bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 416bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 417bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 418576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Sanity check for a PtNode array. 419bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 420576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This method checks that all PtNodes in a node array are ordered as expected. 421bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * If they are, nothing happens. If they aren't, an exception is thrown. 422bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 423576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada private void checkStack(PtNodeArray ptNodeArray) { 424576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ArrayList<PtNode> stack = ptNodeArray.mData; 425bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int lastValue = -1; 426bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 0; i < stack.size(); ++i) { 427bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int currentValue = stack.get(i).mChars[0]; 428bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentValue <= lastValue) 429bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new RuntimeException("Invalid stack"); 430bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard else 431bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard lastValue = currentValue; 432bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 433bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 434bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 435bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 4367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Helper method to add a new bigram to the dictionary. 4377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * 4387cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word1 the previous word of the context 4397cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param word2 the next word of the context 4407cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * @param frequency the bigram frequency 4417cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 4427cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public void setBigram(final String word1, final String word2, final int frequency) { 443576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = findWordInTree(mRootNodeArray, word1); 444576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNode != null) { 445576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2); 446576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNode2 == null) { 44772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard add(getCodePoints(word2), 0, null, false /* isNotAWord */, 44872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 449576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // The PtNode for the first word may have moved by the above insertion, 450ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // if word1 and word2 share a common stem that happens not to have been 451576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // a cutting point until now. In this case, we need to refresh ptNode. 452576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode = findWordInTree(mRootNodeArray, word1); 4537cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 454576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode.addBigram(word2, frequency); 4557cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 4567cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang throw new RuntimeException("First word of bigram not found"); 4577cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 4597cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 4607cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 461bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Add a word to this dictionary. 462bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 46344c64f46a143623dd793facd889c8d6eab5e230cJean Chalard * The shortcuts, if any, have to be in the dictionary already. If they aren't, 464bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * an exception is thrown. 465bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 466bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word, as an int array. 467bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param frequency the frequency of the word, in the range [0..255]. 468eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 46972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 47072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 471bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 472eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard private void add(final int[] word, final int frequency, 47372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, 47472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 475bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(frequency >= 0 && frequency <= 255); 476ffcbbaf12788a9fc9398607a548e552d7d2bf05eSatoshi Kataoka if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { 477eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 478eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada return; 479eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada } 480eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 481af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray currentNodeArray = mRootNodeArray; 482bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int charIndex = 0; 483bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 484576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode currentPtNode = null; 485bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int differentCharIndex = 0; // Set by the loop to the index of the char that differs 486af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); 48793445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { 488576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode = currentNodeArray.mData.get(nodeIndex); 489576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); 490bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (ARRAYS_ARE_EQUAL != differentCharIndex 491576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada && differentCharIndex < currentPtNode.mChars.length) break; 492576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null == currentPtNode.mChildren) break; 493576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada charIndex += currentPtNode.mChars.length; 494bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex >= word.length) break; 495576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentNodeArray = currentPtNode.mChildren; 496af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); 497bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 498bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 49993445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { 500bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // No node at this point to accept the word. Create one. 501af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); 502576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), 50372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); 504576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentNodeArray.mData.add(insertionIndex, newPtNode); 505af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (DBG) checkStack(currentNodeArray); 506bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 507bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // There is a word with a common prefix. 508576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (differentCharIndex == currentPtNode.mChars.length) { 509bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 510bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word is a prefix of an existing word, but the node on which it 511576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // should end already exists as is. Since the old PtNode was not a terminal, 5127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // make it one by filling in its frequency and other attributes 513576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.update(frequency, shortcutTargets, null, isNotAWord, 51472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 515bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 516bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word matches the full old word and extends past it. 517bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We only have to create a new node and add it to the end of this. 518576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newNode = new PtNode( 519bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 52072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, isNotAWord, 52172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 522576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren = new PtNodeArray(); 523576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren.mData.add(newNode); 524bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 525bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 526bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (0 == differentCharIndex) { 5277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // Exact same word. Update the frequency if higher. This will also add the 52844c64f46a143623dd793facd889c8d6eab5e230cJean Chalard // new shortcuts to the existing shortcut list if it already exists. 529576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.update(frequency, shortcutTargets, null, 530576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord && isNotAWord, 531576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsBlacklistEntry || isBlacklistEntry); 532bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 533bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Partial prefix match only. We have to replace the current node with a node 534bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // containing the current prefix and create two new ones for the tails. 535af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray newChildren = new PtNodeArray(); 536576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newOldWord = new PtNode( 537576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, 538576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChars.length), currentPtNode.mShortcutTargets, 539576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mBigrams, currentPtNode.mFrequency, 540576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry, 541576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren); 542bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(newOldWord); 543bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 544576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newParent; 545bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 546576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada newParent = new PtNode( 547576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 54872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 54972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry, newChildren); 550bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 551576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada newParent = new PtNode( 552576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 55345239029ceb876462e0d3f654c6b24ac9a9ed8afKen Wakasa null /* shortcutTargets */, null /* bigrams */, -1, 55472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); 555576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newWord = new PtNode(Arrays.copyOfRange(word, 55644c64f46a143623dd793facd889c8d6eab5e230cJean Chalard charIndex + differentCharIndex, word.length), 55772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard shortcutTargets, null /* bigrams */, frequency, 55872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry); 559bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int addIndex = word[charIndex + differentCharIndex] 560576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada > currentPtNode.mChars[differentCharIndex] ? 1 : 0; 561bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(addIndex, newWord); 562bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 563af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard currentNodeArray.mData.set(nodeIndex, newParent); 564bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 565af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (DBG) checkStack(currentNodeArray); 566bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 567bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 568bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 569bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 57093ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka private static int ARRAYS_ARE_EQUAL = 0; 57193ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka 572bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 573bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Custom comparison of two int arrays taken to contain character codes. 574bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 575bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method compares the two arrays passed as an argument in a lexicographic way, 576bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * with an offset in the dst string. 577bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method does NOT test for the first character. It is taken to be equal. 578bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 579bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 580bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * strings are equal. This works BECAUSE we don't look at the first character. 581bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 582bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param src the left-hand side string of the comparison. 583bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dst the right-hand side string of the comparison. 584bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dstOffset the offset in the right-hand side string. 585bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 586bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 587af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { 588bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We do NOT test the first char, because we come from a method that already 589bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tested it. 590bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 1; i < src.length; ++i) { 591bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dstOffset + i >= dst.length) return i; 592bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (src[i] != dst[dstOffset + i]) return i; 593bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 594bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dst.length > src.length) return src.length; 595bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return ARRAYS_ARE_EQUAL; 596bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 597bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 598bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 599576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Helper class that compares and sorts two PtNodes according to their 600bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * first element only. I repeat: ONLY the first element is considered, the rest 601bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * is ignored. 602bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This comparator imposes orderings that are inconsistent with equals. 603bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 604576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada static private final class PtNodeComparator implements java.util.Comparator<PtNode> { 60593ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka @Override 606576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public int compare(PtNode p1, PtNode p2) { 607576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (p1.mChars[0] == p2.mChars[0]) return 0; 608576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return p1.mChars[0] < p2.mChars[0] ? -1 : 1; 609bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 610bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 611576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); 612bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 613bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 614af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Finds the insertion index of a character within a node array. 615bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 616af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { 617576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final ArrayList<PtNode> data = nodeArray.mData; 618576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode reference = new PtNode(new int[] { character }, 61972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, 62072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 621576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); 622bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return result >= 0 ? result : -result - 1; 623bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 624bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 625bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 626af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Find the index of a char in a node array, if it exists. 627bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 628af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the node array to search in. 629bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param character the character to search for. 63093445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. 631bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 632af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { 633af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int insertionIndex = findInsertionIndex(nodeArray, character); 634af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; 635af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex 63693445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard : CHARACTER_NOT_FOUND_INDEX; 637bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 638bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 639bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 640bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to find a word in a given branch. 641bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 6422da886651874b2588f18f800417ba858ac93d88bJean Chalard @SuppressWarnings("unused") 643576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) { 644bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int index = 0; 64512efad3d15147f255f6e01600c40e9fdb1224d84Jean Chalard final StringBuilder checker = DBG ? new StringBuilder() : null; 646a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard final int[] codePoints = getCodePoints(string); 647bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 648576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode currentPtNode; 649bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 650af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); 65193445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; 652576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode = nodeArray.mData.get(indexOfGroup); 65366f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 654576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (codePoints.length - index < currentPtNode.mChars.length) return null; 65566f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada int newIndex = index; 656576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { 657576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; 65866f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada newIndex++; 65966f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada } 66066f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada index = newIndex; 66166f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 662576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (DBG) { 663576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); 664576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada } 665a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) { 666576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada nodeArray = currentPtNode.mChildren; 667bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 668af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard } while (null != nodeArray && index < codePoints.length); 669bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 670a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) return null; 671576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (!currentPtNode.isTerminal()) return null; 672ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard if (DBG && !string.equals(checker.toString())) return null; 673576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return currentPtNode; 674bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 675bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 676bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 677eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard * Helper method to find out whether a word is in the dict or not. 678eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard */ 679eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard public boolean hasWord(final String s) { 680eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard if (null == s || "".equals(s)) { 681eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard throw new RuntimeException("Can't search for a null or empty string"); 682eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 683af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return null != findWordInTree(mRootNodeArray, s); 684eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 685eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard 686eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard /** 687576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Recursively count the number of PtNodes in a given branch of the trie. 688bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 689af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the parent node. 690576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @return the number of PtNodes in all the branch under this node. 691bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 692576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static int countPtNodes(final PtNodeArray nodeArray) { 693af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int nodeSize = nodeArray.mData.size(); 694bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = nodeSize; 695bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = nodeSize - 1; i >= 0; --i) { 696576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = nodeArray.mData.get(i); 697576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mChildren) 698576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += countPtNodes(ptNode.mChildren); 699bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 700bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 701bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 702bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 703bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 704bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Recursively count the number of nodes in a given branch of the trie. 705bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 706af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the node array to count. 70720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return the number of nodes in this branch. 708bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 709af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public static int countNodeArrays(final PtNodeArray nodeArray) { 710bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = 1; 711af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { 712576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = nodeArray.mData.get(i); 713576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mChildren) 714576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += countNodeArrays(ptNode.mChildren); 715bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 716bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 717bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 718bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 71920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // Recursively find out whether there are any bigrams. 72020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // This can be pretty expensive especially if there aren't any (we return as soon 72120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // as we find one, so it's much cheaper if there are bigrams) 722af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static boolean hasBigramsInternal(final PtNodeArray nodeArray) { 723af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (null == nodeArray) return false; 724af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { 725576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = nodeArray.mData.get(i); 726576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mBigrams) return true; 727576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (hasBigramsInternal(ptNode.mChildren)) return true; 72820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 72920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard return false; 73020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 73120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 73220a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard /** 73320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * Finds out whether there are any bigrams in this dictionary. 73420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * 73520a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard * @return true if there is any bigram, false otherwise. 73620a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard */ 73720a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // TODO: this is expensive especially for large dictionaries without any bigram. 73820a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // The up side is, this is always accurate and correct and uses no memory. We should 73920a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // find a more efficient way of doing this, without compromising too much on memory 74020a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard // and ease of use. 74120a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard public boolean hasBigrams() { 742af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return hasBigramsInternal(mRootNodeArray); 74320a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard } 74420a6dea1cabfd8822824f7dca828d898e5b91cbcJean Chalard 745bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Historically, the tails of the words were going to be merged to save space. 746bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // However, that would prevent the code to search for a specific address in log(n) 747bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // time so this was abandoned. 748bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The code is still of interest as it does add some compression to any dictionary 749bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // that has no need for attributes. Implementations that does not read attributes should be 750bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // able to read a dictionary with merged tails. 751bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Also, the following code does support frequencies, as in, it will only merges 752bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that share the same frequency. Though it would result in the above loss of 753bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // performance while searching by address, it is still technically possible to merge 754bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tails that contain attributes, but this code does not take that into account - it does 755bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // not compare attributes and will merge terminals with different attributes regardless. 756bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void mergeTails() { 757bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard MakedictLog.i("Do not merge tails"); 758bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return; 759bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 760576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root)); 761576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// MakedictLog.i("Number of PtNodes : " + countPtNodes(root)); 762bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 763af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// final HashMap<String, ArrayList<PtNodeArray>> repository = 764af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// new HashMap<String, ArrayList<PtNodeArray>>(); 765bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// mergeTailsInner(repository, root); 766bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// 767bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of different pseudohashes : " + repository.size()); 768bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// int size = 0; 769af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// for (ArrayList<PtNodeArray> a : repository.values()) { 770bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// size += a.size(); 771bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 772bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Number of nodes after merge : " + (1 + size)); 773bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 774bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 775bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 776bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The following methods are used by the deactivated mergeTails() 777af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// private static boolean isEqual(PtNodeArray a, PtNodeArray b) { 778bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a && null == b) return true; 779bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == a || null == b) return false; 780bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (a.data.size() != b.data.size()) return false; 781bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int size = a.data.size(); 782bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = size - 1; i >= 0; --i) { 783576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// PtNode aPtNode = a.data.get(i); 784576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// PtNode bPtNode = b.data.get(i); 785576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (aPtNode.frequency != bPtNode.frequency) return false; 786576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (aPtNode.alternates == null && bPtNode.alternates != null) return false; 787576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false; 788576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false; 789576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (!isEqual(aPtNode.children, bPtNode.children)) return false; 790bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 791bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return true; 792bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 793bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 794af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner( 795af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) { 796576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// final ArrayList<PtNode> branches = nodeArray.data; 797bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// final int nodeSize = branches.size(); 798bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// for (int i = 0; i < nodeSize; ++i) { 799576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// PtNode ptNode = branches.get(i); 800576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (null != ptNode.children) { 801576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// String pseudoHash = getPseudoHash(ptNode.children); 802af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// ArrayList<PtNodeArray> similarList = map.get(pseudoHash); 803bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (null == similarList) { 804af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// similarList = new ArrayList<PtNodeArray>(); 805bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// map.put(pseudoHash, similarList); 806bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 807bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// boolean merged = false; 808af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// for (PtNodeArray similar : similarList) { 809576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// if (isEqual(ptNode.children, similar)) { 810576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// ptNode.children = similar; 811bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// merged = true; 812bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// break; 813bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 814bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 815bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// if (!merged) { 816576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// similarList.add(ptNode.children); 817bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 818576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// mergeTailsInner(map, ptNode.children); 819bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 820bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 821bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return map; 822bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 823bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 824af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard// private static String getPseudoHash(final PtNodeArray nodeArray) { 825bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// StringBuilder s = new StringBuilder(); 826576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// for (PtNode ptNode : nodeArray.data) { 827576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// s.append(ptNode.frequency); 828576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada// for (int ch : ptNode.chars) { 829bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// s.append(Character.toChars(ch)); 830bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 831bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 832bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// return s.toString(); 833bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard// } 834bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 835bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 836bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Iterator to walk through a dictionary. 837bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 838bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This is purely for convenience. 839bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 840a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka public static final class DictionaryIterator implements Iterator<Word> { 841a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka private static final class Position { 842576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public Iterator<PtNode> pos; 843bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public int length; 844576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public Position(ArrayList<PtNode> ptNodes) { 845576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada pos = ptNodes.iterator(); 846bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard length = 0; 847bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 848bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 849bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final StringBuilder mCurrentString; 850bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final LinkedList<Position> mPositions; 851bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 852576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public DictionaryIterator(ArrayList<PtNode> ptRoot) { 853bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString = new StringBuilder(); 854bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions = new LinkedList<Position>(); 855576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final Position rootPos = new Position(ptRoot); 856bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.add(rootPos); 857bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 858bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 859bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 860bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasNext() { 861bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (Position p : mPositions) { 862bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (p.pos.hasNext()) { 863bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return true; 864bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 865bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 866bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return false; 867bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 868bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 869bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 870bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Word next() { 871bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Position currentPos = mPositions.getLast(); 872a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(currentPos.length); 873bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 874bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 875bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentPos.pos.hasNext()) { 876576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode currentPtNode = currentPos.pos.next(); 877a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard currentPos.length = mCurrentString.length(); 878576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (int i : currentPtNode.mChars) { 879bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString.append(Character.toChars(i)); 880ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 881576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != currentPtNode.mChildren) { 882576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPos = new Position(currentPtNode.mChildren.mData); 883ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard currentPos.length = mCurrentString.length(); 884bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.addLast(currentPos); 885bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 886576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (currentPtNode.mFrequency >= 0) { 887576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return new Word(mCurrentString.toString(), currentPtNode.mFrequency, 888576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mShortcutTargets, currentPtNode.mBigrams, 889576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry); 890ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 891bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 892bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.removeLast(); 893bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentPos = mPositions.getLast(); 894a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(mPositions.getLast().length); 895bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 896f2789819bd005b5b0581e8439601b5501306327dKen Wakasa } while (true); 897bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 898bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 899bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 900bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void remove() { 901bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new UnsupportedOperationException("Unsupported yet"); 902bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 903bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 904bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 905bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 906bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 907bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Method to return an iterator. 908bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 909bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 910bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * and say : for (Word w : x) {} 911bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 912bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 913bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public Iterator<Word> iterator() { 914af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return new DictionaryIterator(mRootNodeArray.mData); 915bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 916bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard} 917