LevenshteinDistance.java revision 2fb3a129925a42e72944b836e85a1a2d55a0d950
1/* 2 * Copyright (C) 2010 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.quicksearchbox.util; 18 19import java.util.List; 20 21/** 22 * This class represents the matrix used in the Levenshtein distance algorithm, together 23 * with the algorithm itself which operates on the matrix. 24 * 25 * We also track of the individual operations applied to transform the source string into the 26 * target string so we can trace the path taken through the matrix afterwards, in order to 27 * perform the formatting as required. 28 */ 29public abstract class LevenshteinDistance<T> { 30 public static final int EDIT_DELETE = 0; 31 public static final int EDIT_INSERT = 1; 32 public static final int EDIT_REPLACE = 2; 33 public static final int EDIT_UNCHANGED = 3; 34 35 private final List<T> mSource; 36 private final List<T> mTarget; 37 private final Entry[][] mTable; 38 39 public LevenshteinDistance(List<T> source, List<T> target) { 40 mTable = new Entry[source.size()+1][target.size()+1]; 41 mSource = source; 42 mTarget = target; 43 set(0, 0, EDIT_UNCHANGED, 0); 44 for (int i = 1; i <= source.size(); ++i) { 45 set(i, 0, EDIT_DELETE, i); 46 } 47 for (int i = 1; i <= target.size(); ++i) { 48 set(0, i, EDIT_INSERT, i); 49 } 50 } 51 52 /** 53 * Compares a source and target token. 54 * @param source Token from the source string. 55 * @param target Token from the target string. 56 * @return {@code true} if the two are the same for the purposes of edit distance calculation, 57 * {@code false} otherwise. 58 */ 59 protected abstract boolean match(T source, T target); 60 61 /** 62 * Implementation of Levenshtein distance algorithm. 63 * 64 * @return The Levenshtein distance. 65 */ 66 public int calculate() { 67 for (int s = 1; s <= mSource.size(); ++s) { 68 T sourceToken = mSource.get(s-1); 69 for (int t = 1; t <= mTarget.size(); ++t) { 70 T targetToken = mTarget.get(t-1); 71 int cost = match(sourceToken, targetToken) ? 0 : 1; 72 73 Entry e = get(s - 1, t); 74 int distance = e.getDistance() + 1; 75 int type = EDIT_DELETE; 76 77 e = get(s, t - 1); 78 if (e.getDistance() + 1 < distance ) { 79 distance = e.getDistance() + 1; 80 type = EDIT_INSERT; 81 } 82 83 e = get(s - 1, t - 1); 84 if (e.getDistance() + cost < distance) { 85 distance = e.getDistance() + cost; 86 type = cost == 0 ? EDIT_UNCHANGED : EDIT_REPLACE; 87 } 88 set(s, t, type, distance); 89 } 90 } 91 return get(mSource.size(), mTarget.size()).getDistance(); 92 } 93 94 /** 95 * Gets the list of operations which were applied to each target token; {@link #calculate} must 96 * have been called on this object before using this method. 97 * @return A list of {@link EditOperation}s indicating the origin of each token in the target 98 * string. The position of the token indicates the position in the source string of the 99 * token that was unchanged/replaced, or the position in the source after which a target 100 * token was inserted. 101 */ 102 public EditOperation[] getTargetOperations() { 103 EditOperation[] ops = new EditOperation[mTarget.size()]; 104 int targetPos = mTarget.size(); 105 int sourcePos = mSource.size(); 106 while (targetPos > 0) { 107 Entry e = get(sourcePos, targetPos); 108 int editType = e.getEditType(); 109 switch (editType) { 110 case LevenshteinDistance.EDIT_DELETE: 111 sourcePos--; 112 break; 113 case LevenshteinDistance.EDIT_INSERT: 114 targetPos--; 115 ops[targetPos] = new EditOperation(editType, sourcePos); 116 break; 117 case LevenshteinDistance.EDIT_UNCHANGED: 118 case LevenshteinDistance.EDIT_REPLACE: 119 targetPos--; 120 sourcePos--; 121 ops[targetPos] = new EditOperation(editType, sourcePos); 122 break; 123 } 124 } 125 126 return ops; 127 } 128 129 private void set(int sourceIdx, int targetIdx, int editType, int distance) { 130 mTable[sourceIdx][targetIdx] = new Entry(editType, distance); 131 } 132 133 private Entry get(int sourceIdx, int targetIdx) { 134 Entry e = mTable[sourceIdx][targetIdx]; 135 if (e == null) { 136 throw new NullPointerException("No entry at " + sourceIdx + "," + targetIdx + 137 "; size: " + (mSource.size() + 1) + "," + (mTarget.size() + 1)); 138 } 139 return e; 140 } 141 142 private static class Entry { 143 private final int mEditType; 144 private final int mDistance; 145 public Entry(int editType, int distance) { 146 mEditType = editType; 147 mDistance = distance; 148 } 149 public int getDistance() { 150 return mDistance; 151 } 152 public int getEditType() { 153 return mEditType; 154 } 155 } 156 157 public static class EditOperation { 158 private final int mType; 159 private final int mPosition; 160 public EditOperation(int type, int position) { 161 mType = type; 162 mPosition = position; 163 } 164 public int getType() { 165 return mType; 166 } 167 public int getPosition() { 168 return mPosition; 169 } 170 } 171 172} 173