1cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov/*
2cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Copyright (C) 2009 The Android Open Source Project
3cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov *
4cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Licensed under the Apache License, Version 2.0 (the "License");
5cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * you may not use this file except in compliance with the License.
6cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * You may obtain a copy of the License at
7cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov *
8cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov *      http://www.apache.org/licenses/LICENSE-2.0
9cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov *
10cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Unless required by applicable law or agreed to in writing, software
11cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * distributed under the License is distributed on an "AS IS" BASIS,
12cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * See the License for the specific language governing permissions and
14cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * limitations under the License
15cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov */
1681567f4a0f7c9c338506bd82f4d33e83c2ccf159Makoto Onukipackage com.android.providers.contacts.aggregation.util;
17cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
18cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikovimport java.util.Arrays;
19cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
20cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov/**
21cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * A string distance calculator, particularly suited for name matching.
2228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov* <p>
2328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * A detailed discussion of the topic of record linkage in general and name matching
2428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * in particular can be found in this article:
2528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * <blockquote>
2628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions".
2728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * Research Report Series, RRS.
2828245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * </blockquote>
29cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov */
30a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikovpublic class NameDistance {
31cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
3228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov    private static final float WINKLER_BONUS_THRESHOLD = 0.7f;
33a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov    private static final int MIN_EXACT_PREFIX_LENGTH = 3;
34cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
35cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    private final int mMaxLength;
3628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov    private final boolean mPrefixOnly;
37cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    private final boolean[] mMatchFlags1;
38cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    private final boolean[] mMatchFlags2;
39cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
40cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    /**
41cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov     * Constructor.
42cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov     *
4328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov     * @param maxLength byte arrays are truncated if longer than this number
44cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov     */
45a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov    public NameDistance(int maxLength) {
46cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        mMaxLength = maxLength;
4728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        mPrefixOnly = false;
48cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        mMatchFlags1 = new boolean[maxLength];
49cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        mMatchFlags2 = new boolean[maxLength];
50cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    }
51cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
52cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    /**
5328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov     * Constructor for a matcher that only checks if one string is the exact prefix of the other
5428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov     */
5528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov    public NameDistance() {
5628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        mPrefixOnly = true;
5728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        mMaxLength = 0;
5828245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        mMatchFlags1 = mMatchFlags2 = null;
5928245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov    }
6028245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov
6128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov    /**
62cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov     * Computes a string distance between two normalized strings passed as byte arrays.
63cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov     */
64cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    public float getDistance(byte bytes1[], byte bytes2[]) {
65cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        byte[] array1, array2;
66cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
67cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        if (bytes1.length > bytes2.length) {
68cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            array2 = bytes1;
69cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            array1 = bytes2;
70cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        } else {
71cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            array2 = bytes2;
72cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            array1 = bytes1;
73cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
74cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
75cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        int length1 = array1.length;
76a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov        if (length1 >= MIN_EXACT_PREFIX_LENGTH) {
77a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov            boolean prefix = true;
78a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov            for (int i = 0; i < array1.length; i++) {
79a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov                if (array1[i] != array2[i]) {
80a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov                    prefix = false;
81a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov                    break;
82a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov                }
83a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov            }
84a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov            if (prefix) {
85a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov                return 1.0f;
86a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov            }
87a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov        }
88a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov
8928245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        if (mPrefixOnly) {
9028245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov            return 0;
9128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        }
9228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov
93cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        if (length1 > mMaxLength) {
94cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            length1 = mMaxLength;
95cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
96cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
97cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        int length2 = array2.length;
98cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        if (length2 > mMaxLength) {
99cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            length2 = mMaxLength;
100cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
101cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
102cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        Arrays.fill(mMatchFlags1, 0, length1, false);
103cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        Arrays.fill(mMatchFlags2, 0, length2, false);
104cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
105cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        int range = length2 / 2 - 1;
106cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        if (range < 0) {
107cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            range = 0;
108cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
109cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
110cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        int matches = 0;
111cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        for (int i = 0; i < length1; i++) {
112cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            byte c1 = array1[i];
113cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
114cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            int from = i - range;
115cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            if (from < 0) {
116cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                from = 0;
117cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            }
118cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
119cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            int to = i + range + 1;
120cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            if (to > length2) {
121cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                to = length2;
122cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            }
123cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
124cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            for (int j = from; j < to; j++) {
125cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                if (!mMatchFlags2[j] && c1 == array2[j]) {
126cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                    mMatchFlags1[i] = mMatchFlags2[j] = true;
127cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                    matches++;
128cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                    break;
129cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                }
130cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            }
131cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
132cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
13328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        if (matches == 0) {
134cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            return 0f;
135cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
136cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
137cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        int transpositions = 0;
138cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        int j = 0;
139cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        for (int i = 0; i < length1; i++) {
140cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            if (mMatchFlags1[i]) {
141cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                while (!mMatchFlags2[j]) {
142cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                    j++;
143cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                }
144cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                if (array1[i] != array2[j]) {
145cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                    transpositions++;
146cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                }
147cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov                j++;
148cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov            }
149cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
150cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
15128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        float m = matches;
1521c3bf258545d2598daa23959d9a042d82d738af7luan hongjun        float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3;
153cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov
15428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        if (jaro < WINKLER_BONUS_THRESHOLD) {
15528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov            return jaro;
156cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov        }
15728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov
15828245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        // Add Winkler bonus
15928245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        int prefix = 0;
16028245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        for (int i = 0; i < length1; i++) {
16128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov            if (bytes1[i] != bytes2[i]) {
16228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov                break;
16328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov            }
16428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov            prefix++;
16528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        }
16628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov
16728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov        return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro);
168cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov    }
169cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov}
170