1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16package com.android.providers.contacts.aggregation.util;
17
18import java.util.Arrays;
19
20/**
21 * A string distance calculator, particularly suited for name matching.
22* <p>
23 * A detailed discussion of the topic of record linkage in general and name matching
24 * in particular can be found in this article:
25 * <blockquote>
26 * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions".
27 * Research Report Series, RRS.
28 * </blockquote>
29 */
30public class NameDistance {
31
32    private static final float WINKLER_BONUS_THRESHOLD = 0.7f;
33    private static final int MIN_EXACT_PREFIX_LENGTH = 3;
34
35    private final int mMaxLength;
36    private final boolean mPrefixOnly;
37    private final boolean[] mMatchFlags1;
38    private final boolean[] mMatchFlags2;
39
40    /**
41     * Constructor.
42     *
43     * @param maxLength byte arrays are truncated if longer than this number
44     */
45    public NameDistance(int maxLength) {
46        mMaxLength = maxLength;
47        mPrefixOnly = false;
48        mMatchFlags1 = new boolean[maxLength];
49        mMatchFlags2 = new boolean[maxLength];
50    }
51
52    /**
53     * Constructor for a matcher that only checks if one string is the exact prefix of the other
54     */
55    public NameDistance() {
56        mPrefixOnly = true;
57        mMaxLength = 0;
58        mMatchFlags1 = mMatchFlags2 = null;
59    }
60
61    /**
62     * Computes a string distance between two normalized strings passed as byte arrays.
63     */
64    public float getDistance(byte bytes1[], byte bytes2[]) {
65        byte[] array1, array2;
66
67        if (bytes1.length > bytes2.length) {
68            array2 = bytes1;
69            array1 = bytes2;
70        } else {
71            array2 = bytes2;
72            array1 = bytes1;
73        }
74
75        int length1 = array1.length;
76        if (length1 >= MIN_EXACT_PREFIX_LENGTH) {
77            boolean prefix = true;
78            for (int i = 0; i < array1.length; i++) {
79                if (array1[i] != array2[i]) {
80                    prefix = false;
81                    break;
82                }
83            }
84            if (prefix) {
85                return 1.0f;
86            }
87        }
88
89        if (mPrefixOnly) {
90            return 0;
91        }
92
93        if (length1 > mMaxLength) {
94            length1 = mMaxLength;
95        }
96
97        int length2 = array2.length;
98        if (length2 > mMaxLength) {
99            length2 = mMaxLength;
100        }
101
102        Arrays.fill(mMatchFlags1, 0, length1, false);
103        Arrays.fill(mMatchFlags2, 0, length2, false);
104
105        int range = length2 / 2 - 1;
106        if (range < 0) {
107            range = 0;
108        }
109
110        int matches = 0;
111        for (int i = 0; i < length1; i++) {
112            byte c1 = array1[i];
113
114            int from = i - range;
115            if (from < 0) {
116                from = 0;
117            }
118
119            int to = i + range + 1;
120            if (to > length2) {
121                to = length2;
122            }
123
124            for (int j = from; j < to; j++) {
125                if (!mMatchFlags2[j] && c1 == array2[j]) {
126                    mMatchFlags1[i] = mMatchFlags2[j] = true;
127                    matches++;
128                    break;
129                }
130            }
131        }
132
133        if (matches == 0) {
134            return 0f;
135        }
136
137        int transpositions = 0;
138        int j = 0;
139        for (int i = 0; i < length1; i++) {
140            if (mMatchFlags1[i]) {
141                while (!mMatchFlags2[j]) {
142                    j++;
143                }
144                if (array1[i] != array2[j]) {
145                    transpositions++;
146                }
147                j++;
148            }
149        }
150
151        float m = matches;
152        float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3;
153
154        if (jaro < WINKLER_BONUS_THRESHOLD) {
155            return jaro;
156        }
157
158        // Add Winkler bonus
159        int prefix = 0;
160        for (int i = 0; i < length1; i++) {
161            if (bytes1[i] != bytes2[i]) {
162                break;
163            }
164            prefix++;
165        }
166
167        return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro);
168    }
169}
170