1/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.quicksearchbox.util;
18
19/**
20 * This class represents the matrix used in the Levenshtein distance algorithm, together
21 * with the algorithm itself which operates on the matrix.
22 *
23 * We also track of the individual operations applied to transform the source string into the
24 * target string so we can trace the path taken through the matrix afterwards, in order to
25 * perform the formatting as required.
26 */
27public class LevenshteinDistance {
28    public static final int EDIT_DELETE = 0;
29    public static final int EDIT_INSERT = 1;
30    public static final int EDIT_REPLACE = 2;
31    public static final int EDIT_UNCHANGED = 3;
32
33    private final Token[] mSource;
34    private final Token[] mTarget;
35    private final int[][] mEditTypeTable;
36    private final int[][] mDistanceTable;
37
38    public LevenshteinDistance(Token[] source, Token[] target) {
39        final int sourceSize = source.length;
40        final int targetSize = target.length;
41        final int[][] editTab = new int[sourceSize+1][targetSize+1];
42        final int[][] distTab = new int[sourceSize+1][targetSize+1];
43        editTab[0][0] = EDIT_UNCHANGED;
44        distTab[0][0] = 0;
45        for (int i = 1; i <= sourceSize; ++i) {
46            editTab[i][0] = EDIT_DELETE;
47            distTab[i][0] = i;
48        }
49        for (int i = 1; i <= targetSize; ++i) {
50            editTab[0][i] = EDIT_INSERT;
51            distTab[0][i] = i;
52        }
53        mEditTypeTable = editTab;
54        mDistanceTable = distTab;
55        mSource = source;
56        mTarget = target;
57    }
58
59    /**
60     * Implementation of Levenshtein distance algorithm.
61     *
62     * @return The Levenshtein distance.
63     */
64    public int calculate() {
65        final Token[] src = mSource;
66        final Token[] trg = mTarget;
67        final int sourceLen = src.length;
68        final int targetLen = trg.length;
69        final int[][] distTab = mDistanceTable;
70        final int[][] editTab = mEditTypeTable;
71        for (int s = 1; s <= sourceLen; ++s) {
72            Token sourceToken = src[s-1];
73            for (int t = 1; t <= targetLen; ++t) {
74                Token targetToken = trg[t-1];
75                int cost = sourceToken.prefixOf(targetToken) ? 0 : 1;
76
77                int distance = distTab[s-1][t] + 1;
78                int type = EDIT_DELETE;
79
80                int d = distTab[s][t - 1];
81                if (d + 1 < distance ) {
82                    distance = d + 1;
83                    type = EDIT_INSERT;
84                }
85
86                d = distTab[s - 1][t - 1];
87                if (d + cost < distance) {
88                    distance = d + cost;
89                    type = cost == 0 ? EDIT_UNCHANGED : EDIT_REPLACE;
90                }
91                distTab[s][t] = distance;
92                editTab[s][t] = type;
93            }
94        }
95        return distTab[sourceLen][targetLen];
96    }
97
98    /**
99     * Gets the list of operations which were applied to each target token; {@link #calculate} must
100     * have been called on this object before using this method.
101     * @return A list of {@link EditOperation}s indicating the origin of each token in the target
102     *      string. The position of the token indicates the position in the source string of the
103     *      token that was unchanged/replaced, or the position in the source after which a target
104     *      token was inserted.
105     */
106    public EditOperation[] getTargetOperations() {
107        final int trgLen = mTarget.length;
108        final EditOperation[] ops = new EditOperation[trgLen];
109        int targetPos = trgLen;
110        int sourcePos = mSource.length;
111        final int[][] editTab = mEditTypeTable;
112        while (targetPos > 0) {
113            int editType = editTab[sourcePos][targetPos];
114            switch (editType) {
115                case LevenshteinDistance.EDIT_DELETE:
116                    sourcePos--;
117                    break;
118                case LevenshteinDistance.EDIT_INSERT:
119                    targetPos--;
120                    ops[targetPos] = new EditOperation(editType, sourcePos);
121                    break;
122                case LevenshteinDistance.EDIT_UNCHANGED:
123                case LevenshteinDistance.EDIT_REPLACE:
124                    targetPos--;
125                    sourcePos--;
126                    ops[targetPos] = new EditOperation(editType, sourcePos);
127                    break;
128            }
129        }
130
131        return ops;
132    }
133
134    public static final class EditOperation {
135        private final int mType;
136        private final int mPosition;
137        public EditOperation(int type, int position) {
138            mType = type;
139            mPosition = position;
140        }
141        public int getType() {
142            return mType;
143        }
144        public int getPosition() {
145            return mPosition;
146        }
147    }
148
149    public static final class Token implements CharSequence {
150        private final char[] mContainer;
151        public final int mStart;
152        public final int mEnd;
153
154        public Token(char[] container, int start, int end) {
155            mContainer = container;
156            mStart = start;
157            mEnd = end;
158        }
159
160        public int length() {
161            return mEnd - mStart;
162        }
163
164        @Override
165        public String toString() {
166            // used in tests only.
167            return subSequence(0, length());
168        }
169
170        public boolean prefixOf(final Token that) {
171            final int len = length();
172            if (len > that.length()) return false;
173            final int thisStart = mStart;
174            final int thatStart = that.mStart;
175            final char[] thisContainer = mContainer;
176            final char[] thatContainer = that.mContainer;
177            for (int i = 0; i < len; ++i) {
178                if (thisContainer[thisStart + i] != thatContainer[thatStart + i]) {
179                    return false;
180                }
181            }
182            return true;
183        }
184
185        public char charAt(int index) {
186            return mContainer[index + mStart];
187        }
188
189        public String subSequence(int start, int end) {
190            return new String(mContainer, mStart + start, length());
191        }
192
193    }
194}
195