1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16package com.android.providers.contacts.aggregation.util;
17
18import android.util.Log;
19import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
20import com.android.providers.contacts.util.Hex;
21
22import java.util.ArrayList;
23import java.util.Collections;
24import java.util.HashMap;
25import java.util.List;
26
27/**
28 * Logic for matching raw contacts' data.
29 */
30public class RawContactMatcher {
31    private static final String TAG = "ContactMatcher";
32
33    // Suggest to aggregate contacts if their match score is equal or greater than this threshold
34    public static final int SCORE_THRESHOLD_SUGGEST = 50;
35
36    public static final int SCORE_THRESHOLD_NO_NAME = 50;
37
38    // Automatically aggregate contacts if their match score is equal or greater than this threshold
39    public static final int SCORE_THRESHOLD_PRIMARY = 70;
40
41    // Automatically aggregate contacts if the match score is equal or greater than this threshold
42    // and there is a secondary match (phone number, email etc).
43    public static final int SCORE_THRESHOLD_SECONDARY = 50;
44
45    // Score for matching phone numbers
46    private static final int PHONE_MATCH_SCORE = 71;
47
48    // Score for matching email addresses
49    private static final int EMAIL_MATCH_SCORE = 71;
50
51    // Score for matching identity
52    private static final int IDENTITY_MATCH_SCORE = 71;
53
54    // Score for matching nickname
55    private static final int NICKNAME_MATCH_SCORE = 71;
56
57    // Maximum number of characters in a name to be considered by the matching algorithm.
58    private static final int MAX_MATCHED_NAME_LENGTH = 30;
59
60    // Scores a multiplied by this number to allow room for "fractional" scores
61    private static final int SCORE_SCALE = 1000;
62
63    public static final int MATCHING_ALGORITHM_EXACT = 0;
64    public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
65    public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
66
67    // Minimum edit distance between two names to be considered an approximate match
68    public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
69
70    // Minimum edit distance between two email ids to be considered an approximate match
71    public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
72
73    // Returned value when we found multiple matches and that was not allowed
74    public static final long MULTIPLE_MATCHES = -2;
75
76    /**
77     * Name matching scores: a matrix by name type vs. candidate lookup type.
78     * For example, if the name type is "full name" while we are looking for a
79     * "full name", the score may be 99. If we are looking for a "nickname" but
80     * find "first name", the score may be 50 (see specific scores defined
81     * below.)
82     * <p>
83     * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
84     * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
85     * match producing the score of 70.  The score may also be 0 if the similarity (distance)
86     * between the strings is below the threshold.
87     * <p>
88     * We use a string matching algorithm, which is particularly suited for
89     * name matching. See {@link NameDistance}.
90     */
91    private static int[] sMinScore =
92            new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
93    private static int[] sMaxScore =
94            new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
95
96    /*
97     * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
98     * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
99     * not!  They are useful in three-way aggregation cases when we have, for example, both
100     * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
101     * with the former rather than the latter.  This is why "reverse" matches have slightly lower
102     * scores than direct matches.
103     */
104    static {
105        setScoreRange(NameLookupType.NAME_EXACT,
106                NameLookupType.NAME_EXACT, 99, 99);
107        setScoreRange(NameLookupType.NAME_VARIANT,
108                NameLookupType.NAME_VARIANT, 90, 90);
109        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
110                NameLookupType.NAME_COLLATION_KEY, 50, 80);
111
112        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
113                NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
114        setScoreRange(NameLookupType.NAME_COLLATION_KEY,
115                NameLookupType.NICKNAME, 50, 60);
116
117        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
118                NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
119        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
120                NameLookupType.NAME_COLLATION_KEY, 50, 60);
121        setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
122                NameLookupType.NICKNAME, 50, 60);
123
124        setScoreRange(NameLookupType.NICKNAME,
125                NameLookupType.NICKNAME, 50, 60);
126        setScoreRange(NameLookupType.NICKNAME,
127                NameLookupType.NAME_COLLATION_KEY, 50, 60);
128        setScoreRange(NameLookupType.NICKNAME,
129                NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
130    }
131
132    /**
133     * Populates the cells of the score matrix and score span matrix
134     * corresponding to the {@code candidateNameType} and {@code nameType}.
135     */
136    private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom,
137            int scoreTo) {
138        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
139        sMinScore[index] = scoreFrom;
140        sMaxScore[index] = scoreTo;
141    }
142
143    /**
144     * Returns the lower range for the match score for the given {@code candidateNameType} and
145     * {@code nameType}.
146     */
147    private static int getMinScore(int candidateNameType, int nameType) {
148        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
149        return sMinScore[index];
150    }
151
152    /**
153     * Returns the upper range for the match score for the given {@code candidateNameType} and
154     * {@code nameType}.
155     */
156    private static int getMaxScore(int candidateNameType, int nameType) {
157        int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
158        return sMaxScore[index];
159    }
160
161    private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
162    private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
163    private int mScoreCount = 0;
164
165    private final NameDistance mNameDistanceConservative = new NameDistance();
166    private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
167
168    private MatchScore getMatchingScore(long rawContactId, long contactId, long accountId) {
169        MatchScore matchingScore = mScores.get(rawContactId);
170        if (matchingScore == null) {
171            if (mScoreList.size() > mScoreCount) {
172                matchingScore = mScoreList.get(mScoreCount);
173                matchingScore.reset(rawContactId, contactId, accountId);
174            } else {
175                matchingScore = new MatchScore(rawContactId, contactId, accountId);
176                mScoreList.add(matchingScore);
177            }
178            mScoreCount++;
179            mScores.put(rawContactId, matchingScore);
180        }
181        return matchingScore;
182    }
183
184    /**
185     * Checks if there is a match and updates the overall score for the
186     * specified contact for a discovered match. The new score is determined
187     * by the prior score, by the type of name we were looking for, the type
188     * of name we found and, if the match is approximate, the distance between the candidate and
189     * actual name.
190     */
191    public void matchName(long rawContactId, long contactId, long accountId, int
192            candidateNameType, String candidateName, int nameType, String name, int algorithm) {
193        int maxScore = getMaxScore(candidateNameType, nameType);
194        if (maxScore == 0) {
195            return;
196        }
197
198        if (candidateName.equals(name)) {
199            updatePrimaryScore(rawContactId, contactId, accountId, maxScore);
200            return;
201        }
202
203        if (algorithm == MATCHING_ALGORITHM_EXACT) {
204            return;
205        }
206
207        int minScore = getMinScore(candidateNameType, nameType);
208        if (minScore == maxScore) {
209            return;
210        }
211
212        final byte[] decodedCandidateName;
213        final byte[] decodedName;
214        try {
215            decodedCandidateName = Hex.decodeHex(candidateName);
216            decodedName = Hex.decodeHex(name);
217        } catch (RuntimeException e) {
218            // How could this happen??  See bug 6827136
219            Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
220            return;
221        }
222
223        NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
224                mNameDistanceConservative : mNameDistanceApproximate;
225
226        int score;
227        float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
228        boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
229                || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
230        float threshold = emailBased
231                ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
232                : APPROXIMATE_MATCH_THRESHOLD;
233        if (distance > threshold) {
234            score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
235        } else {
236            score = 0;
237        }
238
239        updatePrimaryScore(rawContactId, contactId, accountId, score);
240    }
241
242    public void matchIdentity(long rawContactId, long contactId, long accountId) {
243        updateSecondaryScore(rawContactId, contactId, accountId, IDENTITY_MATCH_SCORE);
244    }
245
246    public void updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId) {
247        updateSecondaryScore(rawContactId, contactId, accountId, PHONE_MATCH_SCORE);
248    }
249
250    public void updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId) {
251        updateSecondaryScore(rawContactId, contactId, accountId, EMAIL_MATCH_SCORE);
252    }
253
254    public void updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId) {
255        updateSecondaryScore(rawContactId, contactId, accountId, NICKNAME_MATCH_SCORE);
256    }
257
258    private void updatePrimaryScore(long rawContactId, long contactId, long accountId, int score) {
259        getMatchingScore(rawContactId, contactId, accountId).updatePrimaryScore(score);
260    }
261
262    private void updateSecondaryScore(long rawContactId, long contactId, long accountId,
263            int score) {
264        getMatchingScore(rawContactId, contactId, accountId).updateSecondaryScore(score);
265    }
266
267    public void keepIn(long rawContactId, long contactId, long accountId) {
268        getMatchingScore(rawContactId, contactId, accountId).keepIn();
269    }
270
271    public void keepOut(long rawContactId, long contactId, long accountId) {
272        getMatchingScore(rawContactId, contactId, accountId).keepOut();
273    }
274
275    public void clear() {
276        mScores.clear();
277        mScoreCount = 0;
278    }
279    /**
280     * Returns a list of IDs for raw contacts that are only matched on secondary data elements
281     * (phone number, email address, nickname, identity). We need to check if they are missing
282     * structured name or not to decide if they should be aggregated.
283     * <p>
284     * May return null.
285     */
286    public List<Long> prepareSecondaryMatchCandidates() {
287        ArrayList<Long> rawContactIds = null;
288
289        for (int i = 0; i < mScoreCount; i++) {
290            MatchScore score = mScoreList.get(i);
291            if (score.isKeepOut() ||  score.getPrimaryScore() > SCORE_THRESHOLD_PRIMARY){
292                continue;
293            }
294
295            if (score.getSecondaryScore() >= SCORE_THRESHOLD_PRIMARY) {
296                if (rawContactIds == null) {
297                    rawContactIds = new ArrayList<>();
298                }
299                rawContactIds.add(score.getRawContactId());
300            }
301            score.setPrimaryScore(0);
302        }
303        return rawContactIds;
304    }
305
306    /**
307     * Returns the list of raw contact Ids with the match score over threshold.
308     */
309    public List<MatchScore> pickBestMatches() {
310        final List<MatchScore> matches = new ArrayList<>();
311        for (int i = 0; i < mScoreCount; i++) {
312            MatchScore score = mScoreList.get(i);
313            if (score.isKeepOut()) {
314                continue;
315            }
316
317            if (score.isKeepIn()) {
318                matches.add(score);
319                continue;
320            }
321
322            if (score.getPrimaryScore() >= SCORE_THRESHOLD_PRIMARY ||
323                    (score.getPrimaryScore() == SCORE_THRESHOLD_NO_NAME &&
324                            score.getSecondaryScore() > SCORE_THRESHOLD_SECONDARY)) {
325                matches.add(score);
326            }
327        }
328        return matches;
329    }
330
331    /**
332     * Returns matches in the order of descending score.
333     */
334    public List<MatchScore> pickBestMatches(int threshold) {
335        int scaledThreshold = threshold * SCORE_SCALE;
336        List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
337        Collections.sort(matches);
338        int count = 0;
339        for (int i = 0; i < mScoreCount; i++) {
340            MatchScore matchScore = matches.get(i);
341            if (matchScore.getScore() >= scaledThreshold) {
342                count++;
343            } else {
344                break;
345            }
346        }
347
348        return matches.subList(0, count);
349    }
350
351    @Override
352    public String toString() {
353        return mScoreList.subList(0, mScoreCount).toString();
354    }
355
356    public void matchNoName(Long rawContactId, Long contactId, Long accountId) {
357        updatePrimaryScore(rawContactId, contactId, accountId, SCORE_THRESHOLD_NO_NAME);
358    }
359}
360