1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16package com.android.providers.contacts.aggregation.util;
17
18import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
19import com.android.providers.contacts.util.Hex;
20
21import android.util.Log;
22
23import java.util.ArrayList;
24import java.util.Collections;
25import java.util.HashMap;
26import java.util.List;
27
28/**
29 * Logic for matching contacts' data and accumulating match scores.
30 */
31public class ContactMatcher {
32    private static final String TAG = "ContactMatcher";
33
34    // Best possible match score
35    public static final int MAX_SCORE = 100;
36
37    // Suggest to aggregate contacts if their match score is equal or greater than this threshold
38    public static final int SCORE_THRESHOLD_SUGGEST = 50;
39
40    // Automatically aggregate contacts if their match score is equal or greater than this threshold
41    public static final int SCORE_THRESHOLD_PRIMARY = 70;
42
43    // Automatically aggregate contacts if the match score is equal or greater than this threshold
44    // and there is a secondary match (phone number, email etc).
45    public static final int SCORE_THRESHOLD_SECONDARY = 50;
46
47    // Score for missing data (as opposed to present data but a bad match)
48    private static final int NO_DATA_SCORE = -1;
49
50    // Score for matching phone numbers
51    private static final int PHONE_MATCH_SCORE = 71;
52
53    // Score for matching email addresses
54    private static final int EMAIL_MATCH_SCORE = 71;
55
56    // Score for matching nickname
57    private static final int NICKNAME_MATCH_SCORE = 71;
58
59    // Maximum number of characters in a name to be considered by the matching algorithm.
60    private static final int MAX_MATCHED_NAME_LENGTH = 30;
61
62    // Scores a multiplied by this number to allow room for "fractional" scores
63    private static final int SCORE_SCALE = 1000;
64
65    public static final int MATCHING_ALGORITHM_EXACT = 0;
66    public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
67    public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
68
69    // Minimum edit distance between two names to be considered an approximate match
70    public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
71
72    // Minimum edit distance between two email ids to be considered an approximate match
73    public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
74
75    // Returned value when we found multiple matches and that was not allowed
76    public static final long MULTIPLE_MATCHES = -2;
77
78    /**
79     * Name matching scores: a matrix by name type vs. candidate lookup type.
80     * For example, if the name type is "full name" while we are looking for a
81     * "full name", the score may be 99. If we are looking for a "nickname" but
82     * find "first name", the score may be 50 (see specific scores defined
83     * below.)
84     * <p>
85     * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
86     * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
87     * match producing the score of 70.  The score may also be 0 if the similarity (distance)
88     * between the strings is below the threshold.
89     * <p>
90     * We use a string matching algorithm, which is particularly suited for
91     * name matching. See {@link NameDistance}.
92     */
93    private static int[] sMinScore =
94            new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
95    private static int[] sMaxScore =
96            new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
97
98    /*
99     * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
100     * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
101     * not!  They are useful in three-way aggregation cases when we have, for example, both
102     * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
103     * with the former rather than the latter.  This is why "reverse" matches have slightly lower
104     * scores than direct matches.
105     */
106    static {
107        setScoreRange(NameLookupType.NAME_EXACT,
108                NameLookupType.NAME_EXACT, 99, 99);
109        setScoreRange(NameLookupType.NAME_VARIANT,
110                NameLookupType.NAME_VARIANT, 90, 90);
111        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
112                NameLookupType.NAME_COLLATION_KEY, 50, 80);
113
114        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
115                NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
116        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
117                NameLookupType.NICKNAME, 50, 60);
118
119        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
120                NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
121        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
122                NameLookupType.NAME_COLLATION_KEY, 50, 60);
123        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
124                NameLookupType.NICKNAME, 50, 60);
125
126        setScoreRange(NameLookupType.NICKNAME,
127                NameLookupType.NICKNAME, 50, 60);
128        setScoreRange(NameLookupType.NICKNAME,
129                NameLookupType.NAME_COLLATION_KEY, 50, 60);
130        setScoreRange(NameLookupType.NICKNAME,
131                NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
132    }
133
134    /**
135     * Populates the cells of the score matrix and score span matrix
136     * corresponding to the {@code candidateNameType} and {@code nameType}.
137     */
138    private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
139        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
140        sMinScore[index] = scoreFrom;
141        sMaxScore[index] = scoreTo;
142    }
143
144    /**
145     * Returns the lower range for the match score for the given {@code candidateNameType} and
146     * {@code nameType}.
147     */
148    private static int getMinScore(int candidateNameType, int nameType) {
149        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
150        return sMinScore[index];
151    }
152
153    /**
154     * Returns the upper range for the match score for the given {@code candidateNameType} and
155     * {@code nameType}.
156     */
157    private static int getMaxScore(int candidateNameType, int nameType) {
158        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
159        return sMaxScore[index];
160    }
161
162    /**
163     * Captures the max score and match count for a specific contact.  Used in an
164     * contactId - MatchScore map.
165     */
166    public static class MatchScore implements Comparable<MatchScore> {
167        private long mContactId;
168        private boolean mKeepIn;
169        private boolean mKeepOut;
170        private int mPrimaryScore;
171        private int mSecondaryScore;
172        private int mMatchCount;
173
174        public MatchScore(long contactId) {
175            this.mContactId = contactId;
176        }
177
178        public void reset(long contactId) {
179            this.mContactId = contactId;
180            mKeepIn = false;
181            mKeepOut = false;
182            mPrimaryScore = 0;
183            mSecondaryScore = 0;
184            mMatchCount = 0;
185        }
186
187        public long getContactId() {
188            return mContactId;
189        }
190
191        public void updatePrimaryScore(int score) {
192            if (score > mPrimaryScore) {
193                mPrimaryScore = score;
194            }
195            mMatchCount++;
196        }
197
198        public void updateSecondaryScore(int score) {
199            if (score > mSecondaryScore) {
200                mSecondaryScore = score;
201            }
202            mMatchCount++;
203        }
204
205        public void keepIn() {
206            mKeepIn = true;
207        }
208
209        public void keepOut() {
210            mKeepOut = true;
211        }
212
213        public int getScore() {
214            if (mKeepOut) {
215                return 0;
216            }
217
218            if (mKeepIn) {
219                return MAX_SCORE;
220            }
221
222            int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
223
224            // Ensure that of two contacts with the same match score the one with more matching
225            // data elements wins.
226            return score * SCORE_SCALE + mMatchCount;
227        }
228
229        /**
230         * Descending order of match score.
231         */
232        @Override
233        public int compareTo(MatchScore another) {
234            return another.getScore() - getScore();
235        }
236
237        @Override
238        public String toString() {
239            return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
240                    + ")";
241        }
242    }
243
244    private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
245    private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
246    private int mScoreCount = 0;
247
248    private final NameDistance mNameDistanceConservative = new NameDistance();
249    private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
250
251    private MatchScore getMatchingScore(long contactId) {
252        MatchScore matchingScore = mScores.get(contactId);
253        if (matchingScore == null) {
254            if (mScoreList.size() > mScoreCount) {
255                matchingScore = mScoreList.get(mScoreCount);
256                matchingScore.reset(contactId);
257            } else {
258                matchingScore = new MatchScore(contactId);
259                mScoreList.add(matchingScore);
260            }
261            mScoreCount++;
262            mScores.put(contactId, matchingScore);
263        }
264        return matchingScore;
265    }
266
267    /**
268     * Marks the contact as a full match, because we found an Identity match
269     */
270    public void matchIdentity(long contactId) {
271        updatePrimaryScore(contactId, MAX_SCORE);
272    }
273
274    /**
275     * Checks if there is a match and updates the overall score for the
276     * specified contact for a discovered match. The new score is determined
277     * by the prior score, by the type of name we were looking for, the type
278     * of name we found and, if the match is approximate, the distance between the candidate and
279     * actual name.
280     */
281    public void matchName(long contactId, int candidateNameType, String candidateName,
282            int nameType, String name, int algorithm) {
283        int maxScore = getMaxScore(candidateNameType, nameType);
284        if (maxScore == 0) {
285            return;
286        }
287
288        if (candidateName.equals(name)) {
289            updatePrimaryScore(contactId, maxScore);
290            return;
291        }
292
293        if (algorithm == MATCHING_ALGORITHM_EXACT) {
294            return;
295        }
296
297        int minScore = getMinScore(candidateNameType, nameType);
298        if (minScore == maxScore) {
299            return;
300        }
301
302        final byte[] decodedCandidateName;
303        final byte[] decodedName;
304        try {
305            decodedCandidateName = Hex.decodeHex(candidateName);
306            decodedName = Hex.decodeHex(name);
307        } catch (RuntimeException e) {
308            // How could this happen??  See bug 6827136
309            Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
310            return;
311        }
312
313        NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
314                mNameDistanceConservative : mNameDistanceApproximate;
315
316        int score;
317        float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
318        boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
319                || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
320        float threshold = emailBased
321                ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
322                : APPROXIMATE_MATCH_THRESHOLD;
323        if (distance > threshold) {
324            score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
325        } else {
326            score = 0;
327        }
328
329        updatePrimaryScore(contactId, score);
330    }
331
332    public void updateScoreWithPhoneNumberMatch(long contactId) {
333        updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
334    }
335
336    public void updateScoreWithEmailMatch(long contactId) {
337        updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
338    }
339
340    public void updateScoreWithNicknameMatch(long contactId) {
341        updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
342    }
343
344    private void updatePrimaryScore(long contactId, int score) {
345        getMatchingScore(contactId).updatePrimaryScore(score);
346    }
347
348    private void updateSecondaryScore(long contactId, int score) {
349        getMatchingScore(contactId).updateSecondaryScore(score);
350    }
351
352    public void keepIn(long contactId) {
353        getMatchingScore(contactId).keepIn();
354    }
355
356    public void keepOut(long contactId) {
357        getMatchingScore(contactId).keepOut();
358    }
359
360    public void clear() {
361        mScores.clear();
362        mScoreCount = 0;
363    }
364
365    /**
366     * Returns a list of IDs for contacts that are matched on secondary data elements
367     * (phone number, email address, nickname). We still need to obtain the approximate
368     * primary score for those contacts to determine if any of them should be aggregated.
369     * <p>
370     * May return null.
371     */
372    public List<Long> prepareSecondaryMatchCandidates(int threshold) {
373        ArrayList<Long> contactIds = null;
374
375        for (int i = 0; i < mScoreCount; i++) {
376            MatchScore score = mScoreList.get(i);
377            if (score.mKeepOut) {
378                continue;
379            }
380
381            int s = score.mSecondaryScore;
382            if (s >= threshold) {
383                if (contactIds == null) {
384                    contactIds = new ArrayList<Long>();
385                }
386                contactIds.add(score.mContactId);
387            }
388            score.mPrimaryScore = NO_DATA_SCORE;
389        }
390        return contactIds;
391    }
392
393    /**
394     * Returns the contactId with the best match score over the specified threshold or -1
395     * if no such contact is found.  If multiple contacts are found, and
396     * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if
397     * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}.
398     */
399    public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
400        long contactId = -1;
401        int maxScore = 0;
402        for (int i = 0; i < mScoreCount; i++) {
403            MatchScore score = mScoreList.get(i);
404            if (score.mKeepOut) {
405                continue;
406            }
407
408            if (score.mKeepIn) {
409                return score.mContactId;
410            }
411
412            int s = score.mPrimaryScore;
413            if (s == NO_DATA_SCORE) {
414                s = score.mSecondaryScore;
415            }
416
417            if (s >= threshold) {
418                if (contactId != -1 && !allowMultipleMatches) {
419                    return MULTIPLE_MATCHES;
420                }
421                // In order to make it stable, let's jut pick the one with the lowest ID
422                // if multiple candidates are found.
423                if ((s > maxScore) || ((s == maxScore) && (contactId > score.mContactId))) {
424                    contactId = score.mContactId;
425                    maxScore = s;
426                }
427            }
428        }
429        return contactId;
430    }
431
432    /**
433     * Returns matches in the order of descending score.
434     */
435    public List<MatchScore> pickBestMatches(int threshold) {
436        int scaledThreshold = threshold * SCORE_SCALE;
437        List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
438        Collections.sort(matches);
439        int count = 0;
440        for (int i = 0; i < mScoreCount; i++) {
441            MatchScore matchScore = matches.get(i);
442            if (matchScore.getScore() >= scaledThreshold) {
443                count++;
444            } else {
445                break;
446            }
447        }
448
449        return matches.subList(0, count);
450    }
451
452    @Override
453    public String toString() {
454        return mScoreList.subList(0, mScoreCount).toString();
455    }
456}
457