1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts.aggregation.util;
17 
18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
19 import com.android.providers.contacts.util.Hex;
20 
21 import android.util.ArrayMap;
22 import android.util.Log;
23 
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.List;
27 
28 /**
29  * Logic for matching contacts' data and accumulating match scores.
30  */
31 public class ContactMatcher {
32     private static final String TAG = "ContactMatcher";
33 
34     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
35     public static final int SCORE_THRESHOLD_SUGGEST = 50;
36 
37     // Automatically aggregate contacts if their match score is equal or greater than this threshold
38     public static final int SCORE_THRESHOLD_PRIMARY = 70;
39 
40     // Automatically aggregate contacts if the match score is equal or greater than this threshold
41     // and there is a secondary match (phone number, email etc).
42     public static final int SCORE_THRESHOLD_SECONDARY = 50;
43 
44     // Score for missing data (as opposed to present data but a bad match)
45     private static final int NO_DATA_SCORE = -1;
46 
47     // Score for matching phone numbers
48     private static final int PHONE_MATCH_SCORE = 71;
49 
50     // Score for matching email addresses
51     private static final int EMAIL_MATCH_SCORE = 71;
52 
53     // Score for matching nickname
54     private static final int NICKNAME_MATCH_SCORE = 71;
55 
56     // Maximum number of characters in a name to be considered by the matching algorithm.
57     private static final int MAX_MATCHED_NAME_LENGTH = 30;
58 
59     public static final int MATCHING_ALGORITHM_EXACT = 0;
60     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
61     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
62 
63     // Minimum edit distance between two names to be considered an approximate match
64     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
65 
66     // Minimum edit distance between two email ids to be considered an approximate match
67     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
68 
69     // Returned value when we found multiple matches and that was not allowed
70     public static final long MULTIPLE_MATCHES = -2;
71 
72     /**
73      * Name matching scores: a matrix by name type vs. candidate lookup type.
74      * For example, if the name type is "full name" while we are looking for a
75      * "full name", the score may be 99. If we are looking for a "nickname" but
76      * find "first name", the score may be 50 (see specific scores defined
77      * below.)
78      * <p>
79      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
80      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
81      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
82      * between the strings is below the threshold.
83      * <p>
84      * We use a string matching algorithm, which is particularly suited for
85      * name matching. See {@link NameDistance}.
86      */
87     private static int[] sMinScore =
88             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
89     private static int[] sMaxScore =
90             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
91 
92     /*
93      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
94      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
95      * not!  They are useful in three-way aggregation cases when we have, for example, both
96      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
97      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
98      * scores than direct matches.
99      */
100     static {
setScoreRange(NameLookupType.NAME_EXACT, NameLookupType.NAME_EXACT, 99, 99)101         setScoreRange(NameLookupType.NAME_EXACT,
102                 NameLookupType.NAME_EXACT, 99, 99);
setScoreRange(NameLookupType.NAME_VARIANT, NameLookupType.NAME_VARIANT, 90, 90)103         setScoreRange(NameLookupType.NAME_VARIANT,
104                 NameLookupType.NAME_VARIANT, 90, 90);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NAME_COLLATION_KEY, 50, 80)105         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
106                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
107 
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.EMAIL_BASED_NICKNAME, 30, 60)108         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
109                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
setScoreRange(NameLookupType.NAME_COLLATION_KEY, NameLookupType.NICKNAME, 50, 60)110         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
111                 NameLookupType.NICKNAME, 50, 60);
112 
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)113         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
114                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)115         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
116                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, NameLookupType.NICKNAME, 50, 60)117         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
118                 NameLookupType.NICKNAME, 50, 60);
119 
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NICKNAME, 50, 60)120         setScoreRange(NameLookupType.NICKNAME,
121                 NameLookupType.NICKNAME, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.NAME_COLLATION_KEY, 50, 60)122         setScoreRange(NameLookupType.NICKNAME,
123                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
setScoreRange(NameLookupType.NICKNAME, NameLookupType.EMAIL_BASED_NICKNAME, 50, 60)124         setScoreRange(NameLookupType.NICKNAME,
125                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
126     }
127 
128     /**
129      * Populates the cells of the score matrix and score span matrix
130      * corresponding to the {@code candidateNameType} and {@code nameType}.
131      */
setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo)132     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
133         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
134         sMinScore[index] = scoreFrom;
135         sMaxScore[index] = scoreTo;
136     }
137 
138     /**
139      * Returns the lower range for the match score for the given {@code candidateNameType} and
140      * {@code nameType}.
141      */
getMinScore(int candidateNameType, int nameType)142     private static int getMinScore(int candidateNameType, int nameType) {
143         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
144         return sMinScore[index];
145     }
146 
147     /**
148      * Returns the upper range for the match score for the given {@code candidateNameType} and
149      * {@code nameType}.
150      */
getMaxScore(int candidateNameType, int nameType)151     private static int getMaxScore(int candidateNameType, int nameType) {
152         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
153         return sMaxScore[index];
154     }
155 
156     private final ArrayMap<Long, MatchScore> mScores = new ArrayMap<>();
157     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
158     private int mScoreCount = 0;
159 
160     private final NameDistance mNameDistanceConservative = new NameDistance();
161     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
162 
getMatchingScore(long contactId)163     private MatchScore getMatchingScore(long contactId) {
164         MatchScore matchingScore = mScores.get(contactId);
165         if (matchingScore == null) {
166             if (mScoreList.size() > mScoreCount) {
167                 matchingScore = mScoreList.get(mScoreCount);
168                 matchingScore.reset(contactId);
169             } else {
170                 matchingScore = new MatchScore(contactId);
171                 mScoreList.add(matchingScore);
172             }
173             mScoreCount++;
174             mScores.put(contactId, matchingScore);
175         }
176         return matchingScore;
177     }
178 
179     /**
180      * Marks the contact as a full match, because we found an Identity match
181      */
matchIdentity(long contactId)182     public void matchIdentity(long contactId) {
183         updatePrimaryScore(contactId, MatchScore.MAX_SCORE);
184     }
185 
186     /**
187      * Checks if there is a match and updates the overall score for the
188      * specified contact for a discovered match. The new score is determined
189      * by the prior score, by the type of name we were looking for, the type
190      * of name we found and, if the match is approximate, the distance between the candidate and
191      * actual name.
192      */
matchName(long contactId, int candidateNameType, String candidateName, int nameType, String name, int algorithm)193     public void matchName(long contactId, int candidateNameType, String candidateName,
194             int nameType, String name, int algorithm) {
195         int maxScore = getMaxScore(candidateNameType, nameType);
196         if (maxScore == 0) {
197             return;
198         }
199 
200         if (candidateName.equals(name)) {
201             updatePrimaryScore(contactId, maxScore);
202             return;
203         }
204 
205         if (algorithm == MATCHING_ALGORITHM_EXACT) {
206             return;
207         }
208 
209         int minScore = getMinScore(candidateNameType, nameType);
210         if (minScore == maxScore) {
211             return;
212         }
213 
214         final byte[] decodedCandidateName;
215         final byte[] decodedName;
216         try {
217             decodedCandidateName = Hex.decodeHex(candidateName);
218             decodedName = Hex.decodeHex(name);
219         } catch (RuntimeException e) {
220             // How could this happen??  See bug 6827136
221             Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
222             return;
223         }
224 
225         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
226                 mNameDistanceConservative : mNameDistanceApproximate;
227 
228         int score;
229         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
230         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
231                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
232         float threshold = emailBased
233                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
234                 : APPROXIMATE_MATCH_THRESHOLD;
235         if (distance > threshold) {
236             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
237         } else {
238             score = 0;
239         }
240 
241         updatePrimaryScore(contactId, score);
242     }
243 
updateScoreWithPhoneNumberMatch(long contactId)244     public void updateScoreWithPhoneNumberMatch(long contactId) {
245         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
246     }
247 
updateScoreWithEmailMatch(long contactId)248     public void updateScoreWithEmailMatch(long contactId) {
249         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
250     }
251 
updateScoreWithNicknameMatch(long contactId)252     public void updateScoreWithNicknameMatch(long contactId) {
253         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
254     }
255 
updatePrimaryScore(long contactId, int score)256     private void updatePrimaryScore(long contactId, int score) {
257         getMatchingScore(contactId).updatePrimaryScore(score);
258     }
259 
updateSecondaryScore(long contactId, int score)260     private void updateSecondaryScore(long contactId, int score) {
261         getMatchingScore(contactId).updateSecondaryScore(score);
262     }
263 
keepIn(long contactId)264     public void keepIn(long contactId) {
265         getMatchingScore(contactId).keepIn();
266     }
267 
keepOut(long contactId)268     public void keepOut(long contactId) {
269         getMatchingScore(contactId).keepOut();
270     }
271 
clear()272     public void clear() {
273         mScores.clear();
274         mScoreCount = 0;
275     }
276 
277     /**
278      * Returns a list of IDs for contacts that are matched on secondary data elements
279      * (phone number, email address, nickname). We still need to obtain the approximate
280      * primary score for those contacts to determine if any of them should be aggregated.
281      * <p>
282      * May return null.
283      */
prepareSecondaryMatchCandidates(int threshold)284     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
285         ArrayList<Long> contactIds = null;
286 
287         for (int i = 0; i < mScoreCount; i++) {
288             MatchScore score = mScoreList.get(i);
289             if (score.isKeepOut()) {
290                 continue;
291             }
292 
293             int s = score.getSecondaryScore();
294             if (s >= threshold) {
295                 if (contactIds == null) {
296                     contactIds = new ArrayList<Long>();
297                 }
298                 contactIds.add(score.getContactId());
299             }
300             score.setPrimaryScore(NO_DATA_SCORE);
301         }
302         return contactIds;
303     }
304 
305     /**
306      * Returns the contactId with the best match score over the specified threshold or -1
307      * if no such contact is found.  If multiple contacts are found, and
308      * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if
309      * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}.
310      */
pickBestMatch(int threshold, boolean allowMultipleMatches)311     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
312         long contactId = -1;
313         int maxScore = 0;
314         for (int i = 0; i < mScoreCount; i++) {
315             MatchScore score = mScoreList.get(i);
316             if (score.isKeepOut()) {
317                 continue;
318             }
319 
320             if (score.isKeepIn()) {
321                 return score.getContactId();
322             }
323 
324             int s = score.getPrimaryScore();
325             if (s == NO_DATA_SCORE) {
326                 s = score.getSecondaryScore();
327             }
328 
329             if (s >= threshold) {
330                 if (contactId != -1 && !allowMultipleMatches) {
331                     return MULTIPLE_MATCHES;
332                 }
333                 // In order to make it stable, let's jut pick the one with the lowest ID
334                 // if multiple candidates are found.
335                 if ((s > maxScore) || ((s == maxScore) && (contactId > score.getContactId()))) {
336                     contactId = score.getContactId();
337                     maxScore = s;
338                 }
339             }
340         }
341         return contactId;
342     }
343 
344     /**
345      * Returns matches in the order of descending score.
346      */
pickBestMatches(int threshold)347     public List<MatchScore> pickBestMatches(int threshold) {
348         int scaledThreshold = threshold * MatchScore.SCORE_SCALE;
349         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
350         Collections.sort(matches);
351         int count = 0;
352         for (int i = 0; i < mScoreCount; i++) {
353             MatchScore matchScore = matches.get(i);
354             if (matchScore.getScore() >= scaledThreshold) {
355                 count++;
356             } else {
357                 break;
358             }
359         }
360 
361         return matches.subList(0, count);
362     }
363 
364     @Override
toString()365     public String toString() {
366         return mScoreList.subList(0, mScoreCount).toString();
367     }
368 }
369