/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License */ package com.android.providers.contacts; import java.util.Arrays; /** * A string distance calculator, particularly suited for name matching. *

* A detailed discussion of the topic of record linkage in general and name matching * in particular can be found in this article: *

* Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions". * Research Report Series, RRS. *
*/ public class NameDistance { private static final float WINKLER_BONUS_THRESHOLD = 0.7f; private static final int MIN_EXACT_PREFIX_LENGTH = 3; private final int mMaxLength; private final boolean mPrefixOnly; private final boolean[] mMatchFlags1; private final boolean[] mMatchFlags2; /** * Constructor. * * @param maxLength byte arrays are truncated if longer than this number */ public NameDistance(int maxLength) { mMaxLength = maxLength; mPrefixOnly = false; mMatchFlags1 = new boolean[maxLength]; mMatchFlags2 = new boolean[maxLength]; } /** * Constructor for a matcher that only checks if one string is the exact prefix of the other */ public NameDistance() { mPrefixOnly = true; mMaxLength = 0; mMatchFlags1 = mMatchFlags2 = null; } /** * Computes a string distance between two normalized strings passed as byte arrays. */ public float getDistance(byte bytes1[], byte bytes2[]) { byte[] array1, array2; if (bytes1.length > bytes2.length) { array2 = bytes1; array1 = bytes2; } else { array2 = bytes2; array1 = bytes1; } int length1 = array1.length; if (length1 >= MIN_EXACT_PREFIX_LENGTH) { boolean prefix = true; for (int i = 0; i < array1.length; i++) { if (array1[i] != array2[i]) { prefix = false; break; } } if (prefix) { return 1.0f; } } if (mPrefixOnly) { return 0; } if (length1 > mMaxLength) { length1 = mMaxLength; } int length2 = array2.length; if (length2 > mMaxLength) { length2 = mMaxLength; } Arrays.fill(mMatchFlags1, 0, length1, false); Arrays.fill(mMatchFlags2, 0, length2, false); int range = length2 / 2 - 1; if (range < 0) { range = 0; } int matches = 0; for (int i = 0; i < length1; i++) { byte c1 = array1[i]; int from = i - range; if (from < 0) { from = 0; } int to = i + range + 1; if (to > length2) { to = length2; } for (int j = from; j < to; j++) { if (!mMatchFlags2[j] && c1 == array2[j]) { mMatchFlags1[i] = mMatchFlags2[j] = true; matches++; break; } } } if (matches == 0) { return 0f; } int transpositions = 0; int j = 0; for (int i = 0; i < length1; i++) { if (mMatchFlags1[i]) { while (!mMatchFlags2[j]) { j++; } if (array1[i] != array2[j]) { transpositions++; } j++; } } float m = matches; float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2)) / m)) / 3; if (jaro < WINKLER_BONUS_THRESHOLD) { return jaro; } // Add Winkler bonus int prefix = 0; for (int i = 0; i < length1; i++) { if (bytes1[i] != bytes2[i]) { break; } prefix++; } return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro); } }