1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts.aggregation.util;
17 
18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
19 import com.android.providers.contacts.util.Hex;
20 
21 import android.util.Log;
22 
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.List;
27 
28 /**
29  * Logic for matching contacts' data and accumulating match scores.
30  */
31 public class ContactMatcher {
32     private static final String TAG = "ContactMatcher";
33 
34     // Best possible match score
35     public static final int MAX_SCORE = 100;
36 
37     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
38     public static final int SCORE_THRESHOLD_SUGGEST = 50;
39 
40     // Automatically aggregate contacts if their match score is equal or greater than this threshold
41     public static final int SCORE_THRESHOLD_PRIMARY = 70;
42 
43     // Automatically aggregate contacts if the match score is equal or greater than this threshold
44     // and there is a secondary match (phone number, email etc).
45     public static final int SCORE_THRESHOLD_SECONDARY = 50;
46 
47     // Score for missing data (as opposed to present data but a bad match)
48     private static final int NO_DATA_SCORE = -1;
49 
50     // Score for matching phone numbers
51     private static final int PHONE_MATCH_SCORE = 71;
52 
53     // Score for matching email addresses
54     private static final int EMAIL_MATCH_SCORE = 71;
55 
56     // Score for matching nickname
57     private static final int NICKNAME_MATCH_SCORE = 71;
58 
59     // Maximum number of characters in a name to be considered by the matching algorithm.
60     private static final int MAX_MATCHED_NAME_LENGTH = 30;
61 
62     // Scores a multiplied by this number to allow room for "fractional" scores
63     private static final int SCORE_SCALE = 1000;
64 
65     public static final int MATCHING_ALGORITHM_EXACT = 0;
66     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
67     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
68 
69     // Minimum edit distance between two names to be considered an approximate match
70     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
71 
72     // Minimum edit distance between two email ids to be considered an approximate match
73     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
74 
75     // Returned value when we found multiple matches and that was not allowed
76     public static final long MULTIPLE_MATCHES = -2;
77 
78     /**
79      * Name matching scores: a matrix by name type vs. candidate lookup type.
80      * For example, if the name type is "full name" while we are looking for a
81      * "full name", the score may be 99. If we are looking for a "nickname" but
82      * find "first name", the score may be 50 (see specific scores defined
83      * below.)
84      * <p>
85      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
86      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
87      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
88      * between the strings is below the threshold.
89      * <p>
90      * We use a string matching algorithm, which is particularly suited for
91      * name matching. See {@link NameDistance}.
92      */
93     private static int[] sMinScore =
94             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
95     private static int[] sMaxScore =
96             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
97 
98     /*
99      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
100      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
101      * not!  They are useful in three-way aggregation cases when we have, for example, both
102      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
103      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
104      * scores than direct matches.
105      */
106     static {
setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)107         setScoreRange(NameLookupType.NAME_EXACT,
108                 NameLookupType.NAME_EXACT, 99, 99);
setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)109         setScoreRange(NameLookupType.NAME_VARIANT,
110                 NameLookupType.NAME_VARIANT, 90, 90);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)111         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
112                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
113 
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)114         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
115                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)116         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
117                 NameLookupType.NICKNAME, 50, 60);
118 
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)119         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
120                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)121         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
122                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)123         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
124                 NameLookupType.NICKNAME, 50, 60);
125 
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)126         setScoreRange(NameLookupType.NICKNAME,
127                 NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)128         setScoreRange(NameLookupType.NICKNAME,
129                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)130         setScoreRange(NameLookupType.NICKNAME,
131                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
132     }
133 
134     /**
135      * Populates the cells of the score matrix and score span matrix
136      * corresponding to the {@code candidateNameType} and {@code nameType}.
137      */
setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)138     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
139         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
140         sMinScore[index] = scoreFrom;
141         sMaxScore[index] = scoreTo;
142     }
143 
144     /**
145      * Returns the lower range for the match score for the given {@code candidateNameType} and
146      * {@code nameType}.
147      */
getMinScore(int candidateNameType, int nameType)148     private static int getMinScore(int candidateNameType, int nameType) {
149         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
150         return sMinScore[index];
151     }
152 
153     /**
154      * Returns the upper range for the match score for the given {@code candidateNameType} and
155      * {@code nameType}.
156      */
getMaxScore(int candidateNameType, int nameType)157     private static int getMaxScore(int candidateNameType, int nameType) {
158         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
159         return sMaxScore[index];
160     }
161 
162     /**
163      * Captures the max score and match count for a specific contact.  Used in an
164      * contactId - MatchScore map.
165      */
166     public static class MatchScore implements Comparable<MatchScore> {
167         private long mContactId;
168         private boolean mKeepIn;
169         private boolean mKeepOut;
170         private int mPrimaryScore;
171         private int mSecondaryScore;
172         private int mMatchCount;
173 
MatchScore(long contactId)174         public MatchScore(long contactId) {
175             this.mContactId = contactId;
176         }
177 
reset(long contactId)178         public void reset(long contactId) {
179             this.mContactId = contactId;
180             mKeepIn = false;
181             mKeepOut = false;
182             mPrimaryScore = 0;
183             mSecondaryScore = 0;
184             mMatchCount = 0;
185         }
186 
getContactId()187         public long getContactId() {
188             return mContactId;
189         }
190 
updatePrimaryScore(int score)191         public void updatePrimaryScore(int score) {
192             if (score > mPrimaryScore) {
193                 mPrimaryScore = score;
194             }
195             mMatchCount++;
196         }
197 
updateSecondaryScore(int score)198         public void updateSecondaryScore(int score) {
199             if (score > mSecondaryScore) {
200                 mSecondaryScore = score;
201             }
202             mMatchCount++;
203         }
204 
keepIn()205         public void keepIn() {
206             mKeepIn = true;
207         }
208 
keepOut()209         public void keepOut() {
210             mKeepOut = true;
211         }
212 
getScore()213         public int getScore() {
214             if (mKeepOut) {
215                 return 0;
216             }
217 
218             if (mKeepIn) {
219                 return MAX_SCORE;
220             }
221 
222             int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
223 
224             // Ensure that of two contacts with the same match score the one with more matching
225             // data elements wins.
226             return score * SCORE_SCALE + mMatchCount;
227         }
228 
229         /**
230          * Descending order of match score.
231          */
232         @Override
compareTo(MatchScore another)233         public int compareTo(MatchScore another) {
234             return another.getScore() - getScore();
235         }
236 
237         @Override
toString()238         public String toString() {
239             return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
240                     + ")";
241         }
242     }
243 
244     private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
245     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
246     private int mScoreCount = 0;
247 
248     private final NameDistance mNameDistanceConservative = new NameDistance();
249     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
250 
getMatchingScore(long contactId)251     private MatchScore getMatchingScore(long contactId) {
252         MatchScore matchingScore = mScores.get(contactId);
253         if (matchingScore == null) {
254             if (mScoreList.size() > mScoreCount) {
255                 matchingScore = mScoreList.get(mScoreCount);
256                 matchingScore.reset(contactId);
257             } else {
258                 matchingScore = new MatchScore(contactId);
259                 mScoreList.add(matchingScore);
260             }
261             mScoreCount++;
262             mScores.put(contactId, matchingScore);
263         }
264         return matchingScore;
265     }
266 
267     /**
268      * Marks the contact as a full match, because we found an Identity match
269      */
matchIdentity(long contactId)270     public void matchIdentity(long contactId) {
271         updatePrimaryScore(contactId, MAX_SCORE);
272     }
273 
274     /**
275      * Checks if there is a match and updates the overall score for the
276      * specified contact for a discovered match. The new score is determined
277      * by the prior score, by the type of name we were looking for, the type
278      * of name we found and, if the match is approximate, the distance between the candidate and
279      * actual name.
280      */
matchName(long contactId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)281     public void matchName(long contactId, int candidateNameType, String candidateName,
282             int nameType, String name, int algorithm) {
283         int maxScore = getMaxScore(candidateNameType, nameType);
284         if (maxScore == 0) {
285             return;
286         }
287 
288         if (candidateName.equals(name)) {
289             updatePrimaryScore(contactId, maxScore);
290             return;
291         }
292 
293         if (algorithm == MATCHING_ALGORITHM_EXACT) {
294             return;
295         }
296 
297         int minScore = getMinScore(candidateNameType, nameType);
298         if (minScore == maxScore) {
299             return;
300         }
301 
302         final byte[] decodedCandidateName;
303         final byte[] decodedName;
304         try {
305             decodedCandidateName = Hex.decodeHex(candidateName);
306             decodedName = Hex.decodeHex(name);
307         } catch (RuntimeException e) {
308             // How could this happen??  See bug 6827136
309             Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
310             return;
311         }
312 
313         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
314                 mNameDistanceConservative : mNameDistanceApproximate;
315 
316         int score;
317         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
318         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
319                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
320         float threshold = emailBased
321                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
322                 : APPROXIMATE_MATCH_THRESHOLD;
323         if (distance > threshold) {
324             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
325         } else {
326             score = 0;
327         }
328 
329         updatePrimaryScore(contactId, score);
330     }
331 
updateScoreWithPhoneNumberMatch(long contactId)332     public void updateScoreWithPhoneNumberMatch(long contactId) {
333         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
334     }
335 
updateScoreWithEmailMatch(long contactId)336     public void updateScoreWithEmailMatch(long contactId) {
337         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
338     }
339 
updateScoreWithNicknameMatch(long contactId)340     public void updateScoreWithNicknameMatch(long contactId) {
341         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
342     }
343 
updatePrimaryScore(long contactId, int score)344     private void updatePrimaryScore(long contactId, int score) {
345         getMatchingScore(contactId).updatePrimaryScore(score);
346     }
347 
updateSecondaryScore(long contactId, int score)348     private void updateSecondaryScore(long contactId, int score) {
349         getMatchingScore(contactId).updateSecondaryScore(score);
350     }
351 
keepIn(long contactId)352     public void keepIn(long contactId) {
353         getMatchingScore(contactId).keepIn();
354     }
355 
keepOut(long contactId)356     public void keepOut(long contactId) {
357         getMatchingScore(contactId).keepOut();
358     }
359 
clear()360     public void clear() {
361         mScores.clear();
362         mScoreCount = 0;
363     }
364 
365     /**
366      * Returns a list of IDs for contacts that are matched on secondary data elements
367      * (phone number, email address, nickname). We still need to obtain the approximate
368      * primary score for those contacts to determine if any of them should be aggregated.
369      * <p>
370      * May return null.
371      */
prepareSecondaryMatchCandidates(int threshold)372     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
373         ArrayList<Long> contactIds = null;
374 
375         for (int i = 0; i < mScoreCount; i++) {
376             MatchScore score = mScoreList.get(i);
377             if (score.mKeepOut) {
378                 continue;
379             }
380 
381             int s = score.mSecondaryScore;
382             if (s >= threshold) {
383                 if (contactIds == null) {
384                     contactIds = new ArrayList<Long>();
385                 }
386                 contactIds.add(score.mContactId);
387             }
388             score.mPrimaryScore = NO_DATA_SCORE;
389         }
390         return contactIds;
391     }
392 
393     /**
394      * Returns the contactId with the best match score over the specified threshold or -1
395      * if no such contact is found.  If multiple contacts are found, and
396      * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if
397      * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}.
398      */
pickBestMatch(int threshold, boolean allowMultipleMatches)399     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
400         long contactId = -1;
401         int maxScore = 0;
402         for (int i = 0; i < mScoreCount; i++) {
403             MatchScore score = mScoreList.get(i);
404             if (score.mKeepOut) {
405                 continue;
406             }
407 
408             if (score.mKeepIn) {
409                 return score.mContactId;
410             }
411 
412             int s = score.mPrimaryScore;
413             if (s == NO_DATA_SCORE) {
414                 s = score.mSecondaryScore;
415             }
416 
417             if (s >= threshold) {
418                 if (contactId != -1 && !allowMultipleMatches) {
419                     return MULTIPLE_MATCHES;
420                 }
421                 // In order to make it stable, let's jut pick the one with the lowest ID
422                 // if multiple candidates are found.
423                 if ((s > maxScore) || ((s == maxScore) && (contactId > score.mContactId))) {
424                     contactId = score.mContactId;
425                     maxScore = s;
426                 }
427             }
428         }
429         return contactId;
430     }
431 
432     /**
433      * Returns matches in the order of descending score.
434      */
pickBestMatches(int threshold)435     public List<MatchScore> pickBestMatches(int threshold) {
436         int scaledThreshold = threshold * SCORE_SCALE;
437         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
438         Collections.sort(matches);
439         int count = 0;
440         for (int i = 0; i < mScoreCount; i++) {
441             MatchScore matchScore = matches.get(i);
442             if (matchScore.getScore() >= scaledThreshold) {
443                 count++;
444             } else {
445                 break;
446             }
447         }
448 
449         return matches.subList(0, count);
450     }
451 
452     @Override
toString()453     public String toString() {
454         return mScoreList.subList(0, mScoreCount).toString();
455     }
456 }
457