1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts.aggregation.util;
17 
18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
19 import com.android.providers.contacts.util.Hex;
20 
21 import android.util.ArrayMap;
22 import android.util.Log;
23 
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.List;
27 
28 /**
29  * Logic for matching contacts' data and accumulating match scores.
30  */
31 public class ContactMatcher {
32     private static final String TAG = "ContactMatcher";
33 
34     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
35     public static final int SCORE_THRESHOLD_SUGGEST = 50;
36 
37     // Automatically aggregate contacts if their match score is equal or greater than this threshold
38     public static final int SCORE_THRESHOLD_PRIMARY = 70;
39 
40     // Automatically aggregate contacts if the match score is equal or greater than this threshold
41     // and there is a secondary match (phone number, email etc).
42     public static final int SCORE_THRESHOLD_SECONDARY = 50;
43 
44     // Score for missing data (as opposed to present data but a bad match)
45     private static final int NO_DATA_SCORE = -1;
46 
47     // Score for matching phone numbers
48     private static final int PHONE_MATCH_SCORE = 71;
49 
50     // Score for matching email addresses
51     private static final int EMAIL_MATCH_SCORE = 71;
52 
53     // Score for matching nickname
54     private static final int NICKNAME_MATCH_SCORE = 71;
55 
56     // Maximum number of characters in a name to be considered by the matching algorithm.
57     private static final int MAX_MATCHED_NAME_LENGTH = 30;
58 
59     public static final int MATCHING_ALGORITHM_EXACT = 0;
60     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
61     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
62 
63     // Minimum edit distance between two names to be considered an approximate match
64     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
65 
66     // Minimum edit distance between two email ids to be considered an approximate match
67     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
68 
69     // Returned value when we found multiple matches and that was not allowed
70     public static final long MULTIPLE_MATCHES = -2;
71 
72     /**
73      * Name matching scores: a matrix by name type vs. candidate lookup type.
74      * For example, if the name type is "full name" while we are looking for a
75      * "full name", the score may be 99. If we are looking for a "nickname" but
76      * find "first name", the score may be 50 (see specific scores defined
77      * below.)
78      * <p>
79      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
80      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
81      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
82      * between the strings is below the threshold.
83      * <p>
84      * We use a string matching algorithm, which is particularly suited for
85      * name matching. See {@link NameDistance}.
86      */
87     private static int[] sMinScore =
88             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
89     private static int[] sMaxScore =
90             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
91 
92     /*
93      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
94      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
95      * not!  They are useful in three-way aggregation cases when we have, for example, both
96      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
97      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
98      * scores than direct matches.
99      */
100     static {
setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)101         setScoreRange(NameLookupType.NAME_EXACT,
102                 NameLookupType.NAME_EXACT, 99, 99);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)103         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
104                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
105 
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)106         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
107                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)108         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
109                 NameLookupType.NICKNAME, 50, 60);
110 
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)111         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
112                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)113         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
114                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)115         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
116                 NameLookupType.NICKNAME, 50, 60);
117 
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)118         setScoreRange(NameLookupType.NICKNAME,
119                 NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)120         setScoreRange(NameLookupType.NICKNAME,
121                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)122         setScoreRange(NameLookupType.NICKNAME,
123                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
124     }
125 
126     /**
127      * Populates the cells of the score matrix and score span matrix
128      * corresponding to the {@code candidateNameType} and {@code nameType}.
129      */
setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)130     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
131         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
132         sMinScore[index] = scoreFrom;
133         sMaxScore[index] = scoreTo;
134     }
135 
136     /**
137      * Returns the lower range for the match score for the given {@code candidateNameType} and
138      * {@code nameType}.
139      */
getMinScore(int candidateNameType, int nameType)140     private static int getMinScore(int candidateNameType, int nameType) {
141         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
142         return sMinScore[index];
143     }
144 
145     /**
146      * Returns the upper range for the match score for the given {@code candidateNameType} and
147      * {@code nameType}.
148      */
getMaxScore(int candidateNameType, int nameType)149     private static int getMaxScore(int candidateNameType, int nameType) {
150         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
151         return sMaxScore[index];
152     }
153 
154     private final ArrayMap<Long, MatchScore> mScores = new ArrayMap<>();
155     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
156     private int mScoreCount = 0;
157 
158     private final NameDistance mNameDistanceConservative = new NameDistance();
159     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
160 
getMatchingScore(long contactId)161     private MatchScore getMatchingScore(long contactId) {
162         MatchScore matchingScore = mScores.get(contactId);
163         if (matchingScore == null) {
164             if (mScoreList.size() > mScoreCount) {
165                 matchingScore = mScoreList.get(mScoreCount);
166                 matchingScore.reset(contactId);
167             } else {
168                 matchingScore = new MatchScore(contactId);
169                 mScoreList.add(matchingScore);
170             }
171             mScoreCount++;
172             mScores.put(contactId, matchingScore);
173         }
174         return matchingScore;
175     }
176 
177     /**
178      * Marks the contact as a full match, because we found an Identity match
179      */
matchIdentity(long contactId)180     public void matchIdentity(long contactId) {
181         updatePrimaryScore(contactId, MatchScore.MAX_SCORE);
182     }
183 
184     /**
185      * Checks if there is a match and updates the overall score for the
186      * specified contact for a discovered match. The new score is determined
187      * by the prior score, by the type of name we were looking for, the type
188      * of name we found and, if the match is approximate, the distance between the candidate and
189      * actual name.
190      */
matchName(long contactId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)191     public void matchName(long contactId, int candidateNameType, String candidateName,
192             int nameType, String name, int algorithm) {
193         int maxScore = getMaxScore(candidateNameType, nameType);
194         if (maxScore == 0) {
195             return;
196         }
197 
198         if (candidateName.equals(name)) {
199             updatePrimaryScore(contactId, maxScore);
200             return;
201         }
202 
203         if (algorithm == MATCHING_ALGORITHM_EXACT) {
204             return;
205         }
206 
207         int minScore = getMinScore(candidateNameType, nameType);
208         if (minScore == maxScore) {
209             return;
210         }
211 
212         final byte[] decodedCandidateName;
213         final byte[] decodedName;
214         try {
215             decodedCandidateName = Hex.decodeHex(candidateName);
216             decodedName = Hex.decodeHex(name);
217         } catch (RuntimeException e) {
218             // How could this happen??  See bug 6827136
219             Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
220             return;
221         }
222 
223         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
224                 mNameDistanceConservative : mNameDistanceApproximate;
225 
226         int score;
227         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
228         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
229                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
230         float threshold = emailBased
231                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
232                 : APPROXIMATE_MATCH_THRESHOLD;
233         if (distance > threshold) {
234             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
235         } else {
236             score = 0;
237         }
238 
239         updatePrimaryScore(contactId, score);
240     }
241 
updateScoreWithPhoneNumberMatch(long contactId)242     public void updateScoreWithPhoneNumberMatch(long contactId) {
243         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
244     }
245 
updateScoreWithEmailMatch(long contactId)246     public void updateScoreWithEmailMatch(long contactId) {
247         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
248     }
249 
updateScoreWithNicknameMatch(long contactId)250     public void updateScoreWithNicknameMatch(long contactId) {
251         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
252     }
253 
updatePrimaryScore(long contactId, int score)254     private void updatePrimaryScore(long contactId, int score) {
255         getMatchingScore(contactId).updatePrimaryScore(score);
256     }
257 
updateSecondaryScore(long contactId, int score)258     private void updateSecondaryScore(long contactId, int score) {
259         getMatchingScore(contactId).updateSecondaryScore(score);
260     }
261 
keepIn(long contactId)262     public void keepIn(long contactId) {
263         getMatchingScore(contactId).keepIn();
264     }
265 
keepOut(long contactId)266     public void keepOut(long contactId) {
267         getMatchingScore(contactId).keepOut();
268     }
269 
clear()270     public void clear() {
271         mScores.clear();
272         mScoreCount = 0;
273     }
274 
275     /**
276      * Returns a list of IDs for contacts that are matched on secondary data elements
277      * (phone number, email address, nickname). We still need to obtain the approximate
278      * primary score for those contacts to determine if any of them should be aggregated.
279      * <p>
280      * May return null.
281      */
prepareSecondaryMatchCandidates(int threshold)282     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
283         ArrayList<Long> contactIds = null;
284 
285         for (int i = 0; i < mScoreCount; i++) {
286             MatchScore score = mScoreList.get(i);
287             if (score.isKeepOut()) {
288                 continue;
289             }
290 
291             int s = score.getSecondaryScore();
292             if (s >= threshold) {
293                 if (contactIds == null) {
294                     contactIds = new ArrayList<Long>();
295                 }
296                 contactIds.add(score.getContactId());
297             }
298             score.setPrimaryScore(NO_DATA_SCORE);
299         }
300         return contactIds;
301     }
302 
303     /**
304      * Returns the contactId with the best match score over the specified threshold or -1
305      * if no such contact is found.  If multiple contacts are found, and
306      * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if
307      * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}.
308      */
pickBestMatch(int threshold, boolean allowMultipleMatches)309     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
310         long contactId = -1;
311         int maxScore = 0;
312         for (int i = 0; i < mScoreCount; i++) {
313             MatchScore score = mScoreList.get(i);
314             if (score.isKeepOut()) {
315                 continue;
316             }
317 
318             if (score.isKeepIn()) {
319                 return score.getContactId();
320             }
321 
322             int s = score.getPrimaryScore();
323             if (s == NO_DATA_SCORE) {
324                 s = score.getSecondaryScore();
325             }
326 
327             if (s >= threshold) {
328                 if (contactId != -1 && !allowMultipleMatches) {
329                     return MULTIPLE_MATCHES;
330                 }
331                 // In order to make it stable, let's jut pick the one with the lowest ID
332                 // if multiple candidates are found.
333                 if ((s > maxScore) || ((s == maxScore) && (contactId > score.getContactId()))) {
334                     contactId = score.getContactId();
335                     maxScore = s;
336                 }
337             }
338         }
339         return contactId;
340     }
341 
342     /**
343      * Returns matches in the order of descending score.
344      */
pickBestMatches(int threshold)345     public List<MatchScore> pickBestMatches(int threshold) {
346         int scaledThreshold = threshold * MatchScore.SCORE_SCALE;
347         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
348         Collections.sort(matches);
349         int count = 0;
350         for (int i = 0; i < mScoreCount; i++) {
351             MatchScore matchScore = matches.get(i);
352             if (matchScore.getScore() >= scaledThreshold) {
353                 count++;
354             } else {
355                 break;
356             }
357         }
358 
359         return matches.subList(0, count);
360     }
361 
362     @Override
toString()363     public String toString() {
364         return mScoreList.subList(0, mScoreCount).toString();
365     }
366 }
367