1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 package com.android.providers.contacts.aggregation.util;
17 
18 import java.util.Arrays;
19 
20 /**
21  * A string distance calculator, particularly suited for name matching.
22 * <p>
23  * A detailed discussion of the topic of record linkage in general and name matching
24  * in particular can be found in this article:
25  * <blockquote>
26  * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions".
27  * Research Report Series, RRS.
28  * </blockquote>
29  */
30 public class NameDistance {
31 
32     private static final float WINKLER_BONUS_THRESHOLD = 0.7f;
33     private static final int MIN_EXACT_PREFIX_LENGTH = 3;
34 
35     private final int mMaxLength;
36     private final boolean mPrefixOnly;
37     private final boolean[] mMatchFlags1;
38     private final boolean[] mMatchFlags2;
39 
40     /**
41      * Constructor.
42      *
43      * @param maxLength byte arrays are truncated if longer than this number
44      */
NameDistance(int maxLength)45     public NameDistance(int maxLength) {
46         mMaxLength = maxLength;
47         mPrefixOnly = false;
48         mMatchFlags1 = new boolean[maxLength];
49         mMatchFlags2 = new boolean[maxLength];
50     }
51 
52     /**
53      * Constructor for a matcher that only checks if one string is the exact prefix of the other
54      */
NameDistance()55     public NameDistance() {
56         mPrefixOnly = true;
57         mMaxLength = 0;
58         mMatchFlags1 = mMatchFlags2 = null;
59     }
60 
61     /**
62      * Computes a string distance between two normalized strings passed as byte arrays.
63      */
getDistance(byte bytes1[], byte bytes2[])64     public float getDistance(byte bytes1[], byte bytes2[]) {
65         byte[] array1, array2;
66 
67         if (bytes1.length > bytes2.length) {
68             array2 = bytes1;
69             array1 = bytes2;
70         } else {
71             array2 = bytes2;
72             array1 = bytes1;
73         }
74 
75         int length1 = array1.length;
76         if (length1 >= MIN_EXACT_PREFIX_LENGTH) {
77             boolean prefix = true;
78             for (int i = 0; i < array1.length; i++) {
79                 if (array1[i] != array2[i]) {
80                     prefix = false;
81                     break;
82                 }
83             }
84             if (prefix) {
85                 return 1.0f;
86             }
87         }
88 
89         if (mPrefixOnly) {
90             return 0;
91         }
92 
93         if (length1 > mMaxLength) {
94             length1 = mMaxLength;
95         }
96 
97         int length2 = array2.length;
98         if (length2 > mMaxLength) {
99             length2 = mMaxLength;
100         }
101 
102         Arrays.fill(mMatchFlags1, 0, length1, false);
103         Arrays.fill(mMatchFlags2, 0, length2, false);
104 
105         int range = length2 / 2 - 1;
106         if (range < 0) {
107             range = 0;
108         }
109 
110         int matches = 0;
111         for (int i = 0; i < length1; i++) {
112             byte c1 = array1[i];
113 
114             int from = i - range;
115             if (from < 0) {
116                 from = 0;
117             }
118 
119             int to = i + range + 1;
120             if (to > length2) {
121                 to = length2;
122             }
123 
124             for (int j = from; j < to; j++) {
125                 if (!mMatchFlags2[j] && c1 == array2[j]) {
126                     mMatchFlags1[i] = mMatchFlags2[j] = true;
127                     matches++;
128                     break;
129                 }
130             }
131         }
132 
133         if (matches == 0) {
134             return 0f;
135         }
136 
137         int transpositions = 0;
138         int j = 0;
139         for (int i = 0; i < length1; i++) {
140             if (mMatchFlags1[i]) {
141                 while (!mMatchFlags2[j]) {
142                     j++;
143                 }
144                 if (array1[i] != array2[j]) {
145                     transpositions++;
146                 }
147                 j++;
148             }
149         }
150 
151         float m = matches;
152         float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3;
153 
154         if (jaro < WINKLER_BONUS_THRESHOLD) {
155             return jaro;
156         }
157 
158         // Add Winkler bonus
159         int prefix = 0;
160         for (int i = 0; i < length1; i++) {
161             if (bytes1[i] != bytes2[i]) {
162                 break;
163             }
164             prefix++;
165         }
166 
167         return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro);
168     }
169 }
170