/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License */ package com.android.providers.contacts; import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; /** * Logic for matching contacts' data and accumulating match scores. */ public class ContactMatcher { // Best possible match score public static final int MAX_SCORE = 100; // Suggest to aggregate contacts if their match score is equal or greater than this threshold public static final int SCORE_THRESHOLD_SUGGEST = 50; // Automatically aggregate contacts if their match score is equal or greater than this threshold public static final int SCORE_THRESHOLD_PRIMARY = 70; // Automatically aggregate contacts if the match score is equal or greater than this threshold // and there is a secondary match (phone number, email etc). public static final int SCORE_THRESHOLD_SECONDARY = 50; // Score for missing data (as opposed to present data but a bad match) private static final int NO_DATA_SCORE = -1; // Score for matching phone numbers private static final int PHONE_MATCH_SCORE = 71; // Score for matching email addresses private static final int EMAIL_MATCH_SCORE = 71; // Score for matching nickname private static final int NICKNAME_MATCH_SCORE = 71; // Maximum number of characters in a name to be considered by the matching algorithm. private static final int MAX_MATCHED_NAME_LENGTH = 30; // Scores a multiplied by this number to allow room for "fractional" scores private static final int SCORE_SCALE = 1000; public static final int MATCHING_ALGORITHM_EXACT = 0; public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; // Minimum edit distance between two names to be considered an approximate match public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; // Minimum edit distance between two email ids to be considered an approximate match public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; /** * Name matching scores: a matrix by name type vs. candidate lookup type. * For example, if the name type is "full name" while we are looking for a * "full name", the score may be 99. If we are looking for a "nickname" but * find "first name", the score may be 50 (see specific scores defined * below.) *
* For approximate matching, we have a range of scores, let's say 40-70. Depending one how * similar the two strings are, the score will be somewhere between 40 and 70, with the exact * match producing the score of 70. The score may also be 0 if the similarity (distance) * between the strings is below the threshold. *
* We use a string matching algorithm, which is particularly suited for
* name matching. See {@link NameDistance}.
*/
private static int[] sMinScore =
new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
private static int[] sMaxScore =
new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
/*
* Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
* {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
* not! They are useful in three-way aggregation cases when we have, for example, both
* John Smith and Smith John. A third contact with the name John Smith should be aggregated
* with the former rather than the latter. This is why "reverse" matches have slightly lower
* scores than direct matches.
*/
static {
setScoreRange(NameLookupType.NAME_EXACT,
NameLookupType.NAME_EXACT, 99, 99);
setScoreRange(NameLookupType.NAME_VARIANT,
NameLookupType.NAME_VARIANT, 90, 90);
setScoreRange(NameLookupType.NAME_COLLATION_KEY,
NameLookupType.NAME_COLLATION_KEY, 50, 80);
setScoreRange(NameLookupType.NAME_COLLATION_KEY,
NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
setScoreRange(NameLookupType.NAME_COLLATION_KEY,
NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.NICKNAME,
NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.NICKNAME,
NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.NICKNAME,
NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
}
/**
* Populates the cells of the score matrix and score span matrix
* corresponding to the {@code candidateNameType} and {@code nameType}.
*/
private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
sMinScore[index] = scoreFrom;
sMaxScore[index] = scoreTo;
}
/**
* Returns the lower range for the match score for the given {@code candidateNameType} and
* {@code nameType}.
*/
private static int getMinScore(int candidateNameType, int nameType) {
int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
return sMinScore[index];
}
/**
* Returns the upper range for the match score for the given {@code candidateNameType} and
* {@code nameType}.
*/
private static int getMaxScore(int candidateNameType, int nameType) {
int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
return sMaxScore[index];
}
/**
* Captures the max score and match count for a specific contact. Used in an
* contactId - MatchScore map.
*/
public static class MatchScore implements Comparable
* May return null.
*/
public List