1bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/* 2bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Copyright (C) 2011 The Android Open Source Project 3bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License"); 58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License. 68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at 7bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * http://www.apache.org/licenses/LICENSE-2.0 9bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 10bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Unless required by applicable law or agreed to in writing, software 118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS, 128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and 148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License. 15bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 16bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 17cdc51fc6afc7fd374c5c9eeb0539cae5cf1de724Tom Ouyangpackage com.android.inputmethod.latin.makedict; 18bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 19b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasaimport com.android.inputmethod.annotations.UsedForTesting; 20eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanadaimport com.android.inputmethod.latin.Constants; 213ad4af2354e7003ac288dafe3600268fe860d752Keisuke Kuroyanagiimport com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; 22eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 23bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.ArrayList; 24bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Arrays; 25bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Collections; 26bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.Iterator; 27bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalardimport java.util.LinkedList; 28bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 29bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard/** 30bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * A dictionary that can fusion heads and tails of words for more compression. 31bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 32b6ca354431367b625daf9fff5fbe4b1f5ef996abKen Wakasa@UsedForTesting 335f5feeba13f6f1a907d90365d8037a361d0ff5daKeisuke Kuroyanagipublic final class FusionDictionary implements Iterable<WordProperty> { 3447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard private static final boolean DBG = MakedictLog.DBG; 3547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard 3693445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard private static int CHARACTER_NOT_FOUND_INDEX = -1; 3793445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard 38bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 39576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * A node array of the dictionary, containing several PtNodes. 40bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 41576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the 42bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * real information. 43bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This class also contains fields to cache size and address, to help with binary 44bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * generation. 45bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 46af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public static final class PtNodeArray { 47576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ArrayList<PtNode> mData; 48bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // To help with binary generation 49e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedSize = Integer.MIN_VALUE; 5091cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 5191cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // always hold the same value except between dictionary address compression, during which 5291cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the update process needs to know about both values at the same time. Updating will 5391cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // update the AfterUpdate value, and the code will move them to BeforeUpdate before 5491cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard // the next update pass. 5591cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 5691cbe3566d21e1ae11c1409ae62c0c0a9614dec6Jean Chalard int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 57e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada int mCachedParentAddress = 0; 58e55b644aefbefb4ac79308c9a59116e69a9c53a2Yuichiro Hanada 59af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public PtNodeArray() { 60a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka mData = new ArrayList<>(); 61bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 62576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNodeArray(ArrayList<PtNode> data) { 6326bd46095a05843e7574dfcf7db53406f215525dKeisuke Kuroyanagi Collections.sort(data, PTNODE_COMPARATOR); 64bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mData = data; 65bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 66bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 67bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 68bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 698ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * PtNode is a group of characters, with probability information, shortcut targets, bigrams, 708ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * and children (Pt means Patricia Trie). 71bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 72576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This is the central class of the in-memory representation. A PtNode is what can 73bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * be seen as a traditional "trie node", except it can hold several characters at the 74576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * same time. A PtNode essentially represents one or several characters in the middle 75af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * of the trie tree; as such, it can be a terminal, and it can have children. 76576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * In this in-memory representation, whether the PtNode is a terminal or not is represented 778ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the 788ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null 798ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present, 808ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * are non-null. 81bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 82576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static final class PtNode { 838ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi private static final int NOT_A_TERMINAL = -1; 84bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int mChars[]; 857cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mShortcutTargets; 867cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang ArrayList<WeightedString> mBigrams; 878ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi // null == mProbabilityInfo indicates this is not a terminal. 888ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi ProbabilityInfo mProbabilityInfo; 89a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. 90af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray mChildren; 9172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsNotAWord; // Only a shortcut 9272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard boolean mIsBlacklistEntry; 9325de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary 9425de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // generation. Before and After always hold the same value except during dictionary 9525de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // address compression, where the update process needs to know about both values at the 9625de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // same time. Updating will update the AfterUpdate value, and the code will move them 9725de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // to BeforeUpdate before the next update pass. 9825de86a6a26232964937872f203f065dd9f9faa4Jean Chalard // The update process does not need two versions of mCachedSize. 99576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedSize; // The size, in bytes, of this PtNode. 100576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) 101576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int mCachedAddressAfterUpdate; // The address of this PtNode (after update) 102bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 103576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 1048ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, 10572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 106bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 1078ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi mProbabilityInfo = probabilityInfo; 1088ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability; 109eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 110bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 111bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = null; 11272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 11372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 114bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 115bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 116576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 1178ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, 118af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final boolean isNotAWord, final boolean isBlacklistEntry, 119af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final PtNodeArray children) { 120bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChars = chars; 1218ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi mProbabilityInfo = probabilityInfo; 122eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard mShortcutTargets = shortcutTargets; 123bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mBigrams = bigrams; 124bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren = children; 12572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 12672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 127bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 128bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 129576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public void addChild(PtNode n) { 130bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (null == mChildren) { 131af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard mChildren = new PtNodeArray(); 132bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 133bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mChildren.mData.add(n); 134bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 135bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 136a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada public int getTerminalId() { 137a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada return mTerminalId; 138a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada } 139a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada 140bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean isTerminal() { 1418ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi return mProbabilityInfo != null; 142bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 143bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 1448ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi public int getProbability() { 1458ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi if (isTerminal()) { 1468ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi return mProbabilityInfo.mProbability; 1478ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi } else { 1488ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi return NOT_A_TERMINAL; 1498ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi } 150b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard } 151b3c98901c5fc1460b54cdf27d74405f27c88e74bJean Chalard 152a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsNotAWord() { 153a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsNotAWord; 154a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 155a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 156a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public boolean getIsBlacklistEntry() { 157a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return mIsBlacklistEntry; 158a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 159a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 160a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getShortcutTargets() { 161a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 162a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mShortcutTargets) return null; 163f41389a74b02a01f7383b1a872db5fa65e81fa1eJean Chalard final ArrayList<WeightedString> copyOfShortcutTargets = 164a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka new ArrayList<>(mShortcutTargets); 165a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfShortcutTargets; 166a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 167a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 168a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard public ArrayList<WeightedString> getBigrams() { 169a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard // We don't want write permission to escape outside the package, so we return a copy 170a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard if (null == mBigrams) return null; 171a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams); 172a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard return copyOfBigrams; 173a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard } 174a23e3330798a3ade6d2f4f5a94b71746feb1b948Jean Chalard 175bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasSeveralChars() { 176bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard assert(mChars.length > 0); 177bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return 1 < mChars.length; 178bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 1797cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 1807cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 1818ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * Adds a word to the bigram list. Updates the probability information if the word already 1827cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * exists. 1837cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 1848ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi public void addBigram(final String word, final ProbabilityInfo probabilityInfo) { 1857cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 186a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka mBigrams = new ArrayList<>(); 1877cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 1887cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = getBigram(word); 1897cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram != null) { 1908ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi bigram.mProbabilityInfo = probabilityInfo; 1917cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 1928ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi bigram = new WeightedString(word, probabilityInfo); 1937cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 1947cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 1957cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 1967cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 1977cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 1987cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the shortcut target for the given word. Returns null if the word is not in the 1997cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * shortcut list. 2007cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2017cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getShortcut(final String word) { 20247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2037cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets != null) { 2047cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mShortcutTargets.size(); 2057cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2067cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString shortcut = mShortcutTargets.get(i); 2077cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcut.mWord.equals(word)) { 2087cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return shortcut; 2097cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2107cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2117cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2127cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2137cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2147cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2157cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 2167cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Gets the bigram for the given word. 2177cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Returns null if the word is not in the bigrams list. 2187cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2197cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang public WeightedString getBigram(final String word) { 22047db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // TODO: Don't do a linear search 2217cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams != null) { 2227cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = mBigrams.size(); 2237cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2247cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang WeightedString bigram = mBigrams.get(i); 2257cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigram.mWord.equals(word)) { 2267cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return bigram; 2277cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2287cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2297cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2307cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang return null; 2317cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2327cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 2337cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 234576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to 2357cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 2367cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * updated if they are higher than the existing ones. 2377cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 2388ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi private void update(final ProbabilityInfo probabilityInfo, 2398ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi final ArrayList<WeightedString> shortcutTargets, 24072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> bigrams, 24172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 2428ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo); 2437cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (shortcutTargets != null) { 2447cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mShortcutTargets == null) { 2457cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets = shortcutTargets; 2467cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2477cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = shortcutTargets.size(); 2487cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2497cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString shortcut = shortcutTargets.get(i); 2507cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingShortcut = getShortcut(shortcut.mWord); 2517cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingShortcut == null) { 2527cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mShortcutTargets.add(shortcut); 2538ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi } else { 2548ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi existingShortcut.mProbabilityInfo = ProbabilityInfo.max( 2558ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi existingShortcut.mProbabilityInfo, shortcut.mProbabilityInfo); 2567cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2577cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2597cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2607cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (bigrams != null) { 2617cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (mBigrams == null) { 2627cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams = bigrams; 2637cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 2647cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final int size = bigrams.size(); 2657cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang for (int i = 0; i < size; ++i) { 2667cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString bigram = bigrams.get(i); 2677cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang final WeightedString existingBigram = getBigram(bigram.mWord); 2687cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang if (existingBigram == null) { 2697cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang mBigrams.add(bigram); 2708ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi } else { 2718ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi existingBigram.mProbabilityInfo = ProbabilityInfo.max( 2728ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi existingBigram.mProbabilityInfo, bigram.mProbabilityInfo); 2737cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2747cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2757cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 2767cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 27772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsNotAWord = isNotAWord; 27872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard mIsBlacklistEntry = isBlacklistEntry; 2797cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 280bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 281bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 282bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public final DictionaryOptions mOptions; 283af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public final PtNodeArray mRootNodeArray; 284bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 285af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { 286af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard mRootNodeArray = rootNodeArray; 287bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mOptions = options; 288bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 289bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 290c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard public void addOptionAttribute(final String key, final String value) { 291c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard mOptions.mAttributes.put(key, value); 292c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard } 293c734c2aca1830643d169fd292e0c9d4d9306af5aJean Chalard 294bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 295bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to convert a String to an int array. 296bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 2973c6d9fe14840fd2c455ec65b6481ed78f99a5460Yuichiro Hanada static int[] getCodePoints(final String word) { 298ca9c3c06137af878607f16573585c72041a4b7bfJean Chalard // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, 29947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard // which is not visible from the makedict package. Factor this code. 300ca9c3c06137af878607f16573585c72041a4b7bfJean Chalard final int length = word.length(); 301ca9c3c06137af878607f16573585c72041a4b7bfJean Chalard if (length <= 0) return new int[] {}; 30247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final char[] characters = word.toCharArray(); 30347db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 30447db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int codePoint = Character.codePointAt(characters, 0); 30547db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard int dsti = 0; 30647db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard for (int srci = Character.charCount(codePoint); 30747db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard srci < length; srci += Character.charCount(codePoint), ++dsti) { 30847db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 30947db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoint = Character.codePointAt(characters, srci); 310c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 31147db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard codePoints[dsti] = codePoint; 31247db0be7cbdb8abafc18c1e49b71f6dac0d46994Jean Chalard return codePoints; 313c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard } 314c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard 315c599f2e9d6ab839f38183aa178684ff0e94178a3Jean Chalard /** 316bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to add a word as a string. 317bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 318bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method adds a word to the dictionary with the given frequency. Optional 319bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * lists of bigrams and shortcuts can be passed here. For each word inside, 320bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * they will be added to the dictionary as necessary. 321bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 322bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word to add. 3238ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * @param probabilityInfo probability information of the word. 324eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 32572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 326bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 3278ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi public void add(final String word, final ProbabilityInfo probabilityInfo, 32872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 3298ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord, 33072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 33172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard } 33272b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard 33372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard /** 33472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * Helper method to add a blacklist entry as a string. 33572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * 33672b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param word the word to add as a blacklist entry. 33772b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param shortcutTargets a list of shortcut targets for this word, or null. 33872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 33972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard */ 34072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard public void addBlacklistEntry(final String word, 34172b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 3428ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord, 3438ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi true /* isBlacklistEntry */); 344bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 345bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 346bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 347576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Sanity check for a PtNode array. 348bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 349576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This method checks that all PtNodes in a node array are ordered as expected. 350bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * If they are, nothing happens. If they aren't, an exception is thrown. 351bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 352576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada private void checkStack(PtNodeArray ptNodeArray) { 353576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ArrayList<PtNode> stack = ptNodeArray.mData; 354bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int lastValue = -1; 355bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 0; i < stack.size(); ++i) { 356bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int currentValue = stack.get(i).mChars[0]; 357bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentValue <= lastValue) 358bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new RuntimeException("Invalid stack"); 359bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard else 360bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard lastValue = currentValue; 361bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 362bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 363bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 364bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 3657cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * Helper method to add a new bigram to the dictionary. 3667cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang * 367ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi * @param word0 the previous word of the context 368ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi * @param word1 the next word of the context 3698ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * @param probabilityInfo the bigram probability info 3707cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang */ 3718ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi public void setBigram(final String word0, final String word1, 3728ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi final ProbabilityInfo probabilityInfo) { 373ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi PtNode ptNode0 = findWordInTree(mRootNodeArray, word0); 374ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi if (ptNode0 != null) { 375ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1); 376ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi if (ptNode1 == null) { 3778ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */, 37872b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard false /* isBlacklistEntry */); 379576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // The PtNode for the first word may have moved by the above insertion, 380ddb0bcc051c1e9f2706f3702f0bb3135e4352f7bJean Chalard // if word1 and word2 share a common stem that happens not to have been 381576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // a cutting point until now. In this case, we need to refresh ptNode. 382ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi ptNode0 = findWordInTree(mRootNodeArray, word0); 3837cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 3848ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi ptNode0.addBigram(word1, probabilityInfo); 3857cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } else { 386ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi throw new RuntimeException("First word of bigram not found " + word0); 3877cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 3887cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang } 3897cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang 3907cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang /** 391bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Add a word to this dictionary. 392bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 39344c64f46a143623dd793facd889c8d6eab5e230cJean Chalard * The shortcuts, if any, have to be in the dictionary already. If they aren't, 394bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * an exception is thrown. 395bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 396bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param word the word, as an int array. 3978ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi * @param probabilityInfo the probability information of the word. 398eec2e51e2cbc9e69739187557846a439ed74325eJean Chalard * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 39972b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 40072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 401bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 4028ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi private void add(final int[] word, final ProbabilityInfo probabilityInfo, 40372b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final ArrayList<WeightedString> shortcutTargets, 40472b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard final boolean isNotAWord, final boolean isBlacklistEntry) { 4058ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY); 406ffcbbaf12788a9fc9398607a548e552d7d2bf05eSatoshi Kataoka if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { 407eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 408eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada return; 409eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada } 410eae7b293e4a854819aa0de663066cd0b6cdd52e7Yuichiro Hanada 411af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray currentNodeArray = mRootNodeArray; 412bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int charIndex = 0; 413bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 414576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode currentPtNode = null; 415bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int differentCharIndex = 0; // Set by the loop to the index of the char that differs 416af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); 41793445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { 418576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode = currentNodeArray.mData.get(nodeIndex); 419576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); 420bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (ARRAYS_ARE_EQUAL != differentCharIndex 421576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada && differentCharIndex < currentPtNode.mChars.length) break; 422576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null == currentPtNode.mChildren) break; 423576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada charIndex += currentPtNode.mChars.length; 424bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex >= word.length) break; 425576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentNodeArray = currentPtNode.mChildren; 426af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); 427bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 428bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 42993445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { 430bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // No node at this point to accept the word. Create one. 431af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); 432576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), 4338ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord, 4348ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi isBlacklistEntry); 435576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentNodeArray.mData.add(insertionIndex, newPtNode); 436af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (DBG) checkStack(currentNodeArray); 437bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 438bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // There is a word with a common prefix. 439576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (differentCharIndex == currentPtNode.mChars.length) { 440bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 441bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word is a prefix of an existing word, but the node on which it 442576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // should end already exists as is. Since the old PtNode was not a terminal, 4437cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // make it one by filling in its frequency and other attributes 4448ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord, 44572b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isBlacklistEntry); 446bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 447bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // The new word matches the full old word and extends past it. 448bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We only have to create a new node and add it to the end of this. 449576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newNode = new PtNode( 450bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 4518ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi shortcutTargets, null /* bigrams */, probabilityInfo, 4528ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi isNotAWord, isBlacklistEntry); 453576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren = new PtNodeArray(); 454576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren.mData.add(newNode); 455bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 456bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 457bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (0 == differentCharIndex) { 4587cfe20efbeb4a94b15291aee95d0559ae2449c45Tom Ouyang // Exact same word. Update the frequency if higher. This will also add the 45944c64f46a143623dd793facd889c8d6eab5e230cJean Chalard // new shortcuts to the existing shortcut list if it already exists. 4608ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi currentPtNode.update(probabilityInfo, shortcutTargets, null, 461576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord && isNotAWord, 462576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsBlacklistEntry || isBlacklistEntry); 463bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 464bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // Partial prefix match only. We have to replace the current node with a node 465bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // containing the current prefix and create two new ones for the tails. 466af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard PtNodeArray newChildren = new PtNodeArray(); 467576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newOldWord = new PtNode( 468576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, 469576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChars.length), currentPtNode.mShortcutTargets, 4708ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi currentPtNode.mBigrams, currentPtNode.mProbabilityInfo, 471576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry, 472576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mChildren); 473bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(newOldWord); 474bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 475576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newParent; 476bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (charIndex + differentCharIndex >= word.length) { 477576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada newParent = new PtNode( 478576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 4798ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi shortcutTargets, null /* bigrams */, probabilityInfo, 48072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry, newChildren); 481bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 482576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada newParent = new PtNode( 483576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 4848ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi null /* shortcutTargets */, null /* bigrams */, 4858ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi null /* probabilityInfo */, false /* isNotAWord */, 4868ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi false /* isBlacklistEntry */, newChildren); 487576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode newWord = new PtNode(Arrays.copyOfRange(word, 48844c64f46a143623dd793facd889c8d6eab5e230cJean Chalard charIndex + differentCharIndex, word.length), 4898ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi shortcutTargets, null /* bigrams */, probabilityInfo, 49072b1c9394105b6fbc0d8c6ff00f3574ee37a9aaaJean Chalard isNotAWord, isBlacklistEntry); 491bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final int addIndex = word[charIndex + differentCharIndex] 492576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada > currentPtNode.mChars[differentCharIndex] ? 1 : 0; 493bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard newChildren.mData.add(addIndex, newWord); 494bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 495af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard currentNodeArray.mData.set(nodeIndex, newParent); 496bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 497af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (DBG) checkStack(currentNodeArray); 498bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 499bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 500bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 501bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 50293ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka private static int ARRAYS_ARE_EQUAL = 0; 50393ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka 504bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 505bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Custom comparison of two int arrays taken to contain character codes. 506bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 507bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method compares the two arrays passed as an argument in a lexicographic way, 508bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * with an offset in the dst string. 509bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method does NOT test for the first character. It is taken to be equal. 510bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 511bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 512bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * strings are equal. This works BECAUSE we don't look at the first character. 513bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 514bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param src the left-hand side string of the comparison. 515bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dst the right-hand side string of the comparison. 516bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param dstOffset the offset in the right-hand side string. 517bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 518bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 519af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { 520bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // We do NOT test the first char, because we come from a method that already 521bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard // tested it. 522bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = 1; i < src.length; ++i) { 523bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dstOffset + i >= dst.length) return i; 524bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (src[i] != dst[dstOffset + i]) return i; 525bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 526bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (dst.length > src.length) return src.length; 527bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return ARRAYS_ARE_EQUAL; 528bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 529bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 530bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 531576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Helper class that compares and sorts two PtNodes according to their 532bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * first element only. I repeat: ONLY the first element is considered, the rest 533bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * is ignored. 534bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This comparator imposes orderings that are inconsistent with equals. 535bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 536576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada static private final class PtNodeComparator implements java.util.Comparator<PtNode> { 53793ebf74bae44728e0d5f7e738ea28376187a876eTadashi G. Takaoka @Override 538576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public int compare(PtNode p1, PtNode p2) { 539576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (p1.mChars[0] == p2.mChars[0]) return 0; 540576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return p1.mChars[0] < p2.mChars[0] ? -1 : 1; 541bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 542bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 543576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); 544bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 545bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 546af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Finds the insertion index of a character within a node array. 547bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 548af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { 549576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final ArrayList<PtNode> data = nodeArray.mData; 550576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode reference = new PtNode(new int[] { character }, 5518ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */, 5528ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi false /* isNotAWord */, false /* isBlacklistEntry */); 553576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); 554bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return result >= 0 ? result : -result - 1; 555bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 556bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 557bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 558af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Find the index of a char in a node array, if it exists. 559bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 560af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the node array to search in. 561bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * @param character the character to search for. 56293445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. 563bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 564af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { 565af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int insertionIndex = findInsertionIndex(nodeArray, character); 566af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; 567af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex 56893445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard : CHARACTER_NOT_FOUND_INDEX; 569bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 570bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 571bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 572bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Helper method to find a word in a given branch. 573bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 574576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) { 575bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int index = 0; 57612efad3d15147f255f6e01600c40e9fdb1224d84Jean Chalard final StringBuilder checker = DBG ? new StringBuilder() : null; 577a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard final int[] codePoints = getCodePoints(string); 578bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 579576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode currentPtNode; 580bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 581af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); 58293445b4821e9e8ecc7dd52f1a5d5316c7eec2654Jean Chalard if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; 583576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode = nodeArray.mData.get(indexOfGroup); 58466f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 585576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (codePoints.length - index < currentPtNode.mChars.length) return null; 58666f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada int newIndex = index; 587576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { 588576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; 58966f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada newIndex++; 59066f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada } 59166f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada index = newIndex; 59266f338983bb9cb04a0d94a4729330b1c8ff01c93Yuichiro Hanada 593576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (DBG) { 594576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); 595576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada } 596a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) { 597576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada nodeArray = currentPtNode.mChildren; 598bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 599af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard } while (null != nodeArray && index < codePoints.length); 600bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 601a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard if (index < codePoints.length) return null; 602576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (!currentPtNode.isTerminal()) return null; 603ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard if (DBG && !string.equals(checker.toString())) return null; 604576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return currentPtNode; 605bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 606bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 607bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 608eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard * Helper method to find out whether a word is in the dict or not. 609eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard */ 610eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard public boolean hasWord(final String s) { 611eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard if (null == s || "".equals(s)) { 612eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard throw new RuntimeException("Can't search for a null or empty string"); 613eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 614af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return null != findWordInTree(mRootNodeArray, s); 615eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard } 616eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard 617eddfbb68dcff55c85b3d5b82d406f543bd038825Jean Chalard /** 618576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Recursively count the number of PtNodes in a given branch of the trie. 619bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 620af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the parent node. 621576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @return the number of PtNodes in all the branch under this node. 622bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 623576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static int countPtNodes(final PtNodeArray nodeArray) { 624af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int nodeSize = nodeArray.mData.size(); 625bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard int size = nodeSize; 626bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (int i = nodeSize - 1; i >= 0; --i) { 627576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada PtNode ptNode = nodeArray.mData.get(i); 628576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mChildren) 629576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += countPtNodes(ptNode.mChildren); 630bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 631bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return size; 632bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 633bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 634bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 635bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Iterator to walk through a dictionary. 636bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 637bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This is purely for convenience. 638bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 6395f5feeba13f6f1a907d90365d8037a361d0ff5daKeisuke Kuroyanagi public static final class DictionaryIterator implements Iterator<WordProperty> { 640a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka private static final class Position { 641576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public Iterator<PtNode> pos; 642bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public int length; 643576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public Position(ArrayList<PtNode> ptNodes) { 644576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada pos = ptNodes.iterator(); 645bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard length = 0; 646bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 647bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 648bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final StringBuilder mCurrentString; 649bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard final LinkedList<Position> mPositions; 650bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 651576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public DictionaryIterator(ArrayList<PtNode> ptRoot) { 652bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString = new StringBuilder(); 653a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka mPositions = new LinkedList<>(); 654576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final Position rootPos = new Position(ptRoot); 655bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.add(rootPos); 656bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 657bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 658bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 659bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public boolean hasNext() { 660bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard for (Position p : mPositions) { 661bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (p.pos.hasNext()) { 662bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return true; 663bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 664bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 665bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard return false; 666bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 667bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 668bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 6695f5feeba13f6f1a907d90365d8037a361d0ff5daKeisuke Kuroyanagi public WordProperty next() { 670bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard Position currentPos = mPositions.getLast(); 671a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(currentPos.length); 672bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 673bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard do { 674bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard if (currentPos.pos.hasNext()) { 675576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode currentPtNode = currentPos.pos.next(); 676a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard currentPos.length = mCurrentString.length(); 677576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (int i : currentPtNode.mChars) { 678bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mCurrentString.append(Character.toChars(i)); 679ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 680576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != currentPtNode.mChildren) { 681576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPos = new Position(currentPtNode.mChildren.mData); 682ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard currentPos.length = mCurrentString.length(); 683bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.addLast(currentPos); 684bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 6858ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi if (currentPtNode.isTerminal()) { 6868ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi return new WordProperty(mCurrentString.toString(), 6878ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi currentPtNode.mProbabilityInfo, 688576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mShortcutTargets, currentPtNode.mBigrams, 689576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry); 690ca0fdbbe2ec4d282ef14154d6994271d62e6b2baJean Chalard } 691bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } else { 692bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard mPositions.removeLast(); 693bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard currentPos = mPositions.getLast(); 694a411595b169c1f136d09d114a458def1f99f91d9Jean Chalard mCurrentString.setLength(mPositions.getLast().length); 695bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 696f2789819bd005b5b0581e8439601b5501306327dKen Wakasa } while (true); 697bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 698bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 699bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 700bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard public void remove() { 701bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard throw new UnsupportedOperationException("Unsupported yet"); 702bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 703bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 704bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 705bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard 706bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard /** 707bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * Method to return an iterator. 708bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * 709bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 710bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard * and say : for (Word w : x) {} 711bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard */ 712bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard @Override 7135f5feeba13f6f1a907d90365d8037a361d0ff5daKeisuke Kuroyanagi public Iterator<WordProperty> iterator() { 714af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return new DictionaryIterator(mRootNodeArray.mData); 715bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard } 716bfbbee8c5757aef4a20879547c16af0a4d1bf4c7Jean Chalard} 717