1cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov/* 2cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Copyright (C) 2009 The Android Open Source Project 3cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * 4cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Licensed under the Apache License, Version 2.0 (the "License"); 5cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * you may not use this file except in compliance with the License. 6cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * You may obtain a copy of the License at 7cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * 8cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * http://www.apache.org/licenses/LICENSE-2.0 9cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * 10cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Unless required by applicable law or agreed to in writing, software 11cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * distributed under the License is distributed on an "AS IS" BASIS, 12cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * See the License for the specific language governing permissions and 14cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * limitations under the License 15cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov */ 1681567f4a0f7c9c338506bd82f4d33e83c2ccf159Makoto Onukipackage com.android.providers.contacts.aggregation.util; 17cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 18cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikovimport java.util.Arrays; 19cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 20cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov/** 21cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * A string distance calculator, particularly suited for name matching. 2228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov* <p> 2328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * A detailed discussion of the topic of record linkage in general and name matching 2428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * in particular can be found in this article: 2528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * <blockquote> 2628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions". 2728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * Research Report Series, RRS. 2828245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * </blockquote> 29cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov */ 30a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikovpublic class NameDistance { 31cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 3228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov private static final float WINKLER_BONUS_THRESHOLD = 0.7f; 33a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov private static final int MIN_EXACT_PREFIX_LENGTH = 3; 34cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 35cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov private final int mMaxLength; 3628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov private final boolean mPrefixOnly; 37cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov private final boolean[] mMatchFlags1; 38cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov private final boolean[] mMatchFlags2; 39cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 40cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov /** 41cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Constructor. 42cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * 4328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * @param maxLength byte arrays are truncated if longer than this number 44cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov */ 45a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov public NameDistance(int maxLength) { 46cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov mMaxLength = maxLength; 4728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov mPrefixOnly = false; 48cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov mMatchFlags1 = new boolean[maxLength]; 49cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov mMatchFlags2 = new boolean[maxLength]; 50cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 51cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 52cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov /** 5328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov * Constructor for a matcher that only checks if one string is the exact prefix of the other 5428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov */ 5528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov public NameDistance() { 5628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov mPrefixOnly = true; 5728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov mMaxLength = 0; 5828245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov mMatchFlags1 = mMatchFlags2 = null; 5928245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov } 6028245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov 6128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov /** 62cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov * Computes a string distance between two normalized strings passed as byte arrays. 63cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov */ 64cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov public float getDistance(byte bytes1[], byte bytes2[]) { 65cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov byte[] array1, array2; 66cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 67cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (bytes1.length > bytes2.length) { 68cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov array2 = bytes1; 69cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov array1 = bytes2; 70cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } else { 71cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov array2 = bytes2; 72cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov array1 = bytes1; 73cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 74cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 75cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int length1 = array1.length; 76a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov if (length1 >= MIN_EXACT_PREFIX_LENGTH) { 77a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov boolean prefix = true; 78a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov for (int i = 0; i < array1.length; i++) { 79a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov if (array1[i] != array2[i]) { 80a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov prefix = false; 81a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov break; 82a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov } 83a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov } 84a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov if (prefix) { 85a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov return 1.0f; 86a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov } 87a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov } 88a276d23a70bdb8fdc7c8340a12a64e90696da115Dmitri Plotnikov 8928245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov if (mPrefixOnly) { 9028245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov return 0; 9128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov } 9228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov 93cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (length1 > mMaxLength) { 94cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov length1 = mMaxLength; 95cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 96cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 97cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int length2 = array2.length; 98cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (length2 > mMaxLength) { 99cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov length2 = mMaxLength; 100cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 101cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 102cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov Arrays.fill(mMatchFlags1, 0, length1, false); 103cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov Arrays.fill(mMatchFlags2, 0, length2, false); 104cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 105cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int range = length2 / 2 - 1; 106cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (range < 0) { 107cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov range = 0; 108cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 109cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 110cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int matches = 0; 111cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov for (int i = 0; i < length1; i++) { 112cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov byte c1 = array1[i]; 113cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 114cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int from = i - range; 115cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (from < 0) { 116cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov from = 0; 117cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 118cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 119cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int to = i + range + 1; 120cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (to > length2) { 121cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov to = length2; 122cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 123cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 124cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov for (int j = from; j < to; j++) { 125cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (!mMatchFlags2[j] && c1 == array2[j]) { 126cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov mMatchFlags1[i] = mMatchFlags2[j] = true; 127cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov matches++; 128cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov break; 129cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 130cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 131cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 132cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 13328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov if (matches == 0) { 134cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov return 0f; 135cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 136cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 137cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int transpositions = 0; 138cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov int j = 0; 139cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov for (int i = 0; i < length1; i++) { 140cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (mMatchFlags1[i]) { 141cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov while (!mMatchFlags2[j]) { 142cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov j++; 143cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 144cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov if (array1[i] != array2[j]) { 145cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov transpositions++; 146cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 147cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov j++; 148cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 149cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 150cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 15128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov float m = matches; 1521c3bf258545d2598daa23959d9a042d82d738af7luan hongjun float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3; 153cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov 15428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov if (jaro < WINKLER_BONUS_THRESHOLD) { 15528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov return jaro; 156cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 15728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov 15828245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov // Add Winkler bonus 15928245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov int prefix = 0; 16028245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov for (int i = 0; i < length1; i++) { 16128245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov if (bytes1[i] != bytes2[i]) { 16228245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov break; 16328245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov } 16428245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov prefix++; 16528245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov } 16628245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov 16728245e1690abbab18b3224e4164a738f6e64d6c0Dmitri Plotnikov return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro); 168cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov } 169cceca0d48bc5b0c7fc20b987439add82f734a8f5Dmitri Plotnikov} 170